library(tidyverse)
library(skimr)
library(janitor)
library(tidymodels)
US Mental Health
Appendix to report
Data cleaning
<- read_csv("data/Final_Data/AGE x MH1.csv") age_mh1
Rows: 240 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): Age (recoded), Mental health diagnosis one, Total % SE, Total % CI...
dbl (4): Total %, Row %, Column %, Unweighted Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Cleaning data for Age and Mental Health diagnosis
<- age_mh1|>
age_mh1 clean_names()|>
select(age_recoded,mental_health_diagnosis_one,unweighted_count)|>
filter(age_recoded!="Overall")|>
filter(mental_health_diagnosis_one!="Overall")|>
filter(age_recoded!="Missing/unknown/not collected/invalid")|>
slice(2:nrow(age_mh1))|>
rename(
age_count=unweighted_count
)write_csv(age_mh1,file = "data/Final_Data/Clean_Data/AGE_MH1_CLEAN.csv")
The first thing that is done to the data after it has been read, is the name of the columns are converted to follow the appropriate syntax
selected age_recoded,mental_health_diagnosis_one,unweighted_count from columns
Filtered out all age_recorded = to Overall
Filtered out all mental_health_diagnosis_one =Overall
Filtered out all age_recoded =Missing/unknown/not collected/invalid
Removed first row to get full set of data
Changed unweight_count to age_count to reflect that dataset deals with ages of persons who took the survey
<- read_csv("data/Final_Data/EDUC x MH1.csv") educ_mh1
Rows: 105 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): Education, Mental health diagnosis one, Total % SE, Total % CI (lo...
dbl (4): Total %, Row %, Column %, Unweighted Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- educ_mh1 |>
educ_mh1 clean_names()|>
select(education,mental_health_diagnosis_one,unweighted_count)|>
select(education,mental_health_diagnosis_one,unweighted_count)|>
filter(education!="Overall")|>
filter(mental_health_diagnosis_one!="Overall")|>
filter(education!="Missing/unknown/not collected/invalid")|>
slice(2:nrow(educ_mh1))|>
rename(
edu_count=unweighted_count
)
write_csv(educ_mh1,file = "data/Final_Data/Clean_Data/EDUC_MH1_CLEAN.csv")
The first thing that is done to the data after it has been read, is the name of the columns are converted to follow the appropriate syntax
selected education,mental_health_diagnosis_one,unweighted_count from columns
Filtered out all education = to Overall
Filtered out all mental_health_diagnosis_one =Overall
Filtered out all education =Missing/unknown/not collected/invalid
Removed first row to get full set of data
Changed unweight_count to educ_count to reflect that dataset deals with education levels of persons who took the survey
<- read_csv("data/Final_Data/ETHNIC x MH1.csv") ethn_mh1
Rows: 90 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): Hispanic or Latino origin (ethnicity), Mental health diagnosis one...
dbl (4): Total %, Row %, Column %, Unweighted Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- ethn_mh1|>
ethn_mh1 clean_names()|>
rename(
ethnicity=hispanic_or_latino_origin_ethnicity
|>
)select(ethnicity,mental_health_diagnosis_one,unweighted_count)|>
select(ethnicity,mental_health_diagnosis_one,unweighted_count)|>
filter(ethnicity!="Overall")|>
filter(mental_health_diagnosis_one!="Overall")|>
filter(ethnicity!="Missing/unknown/not collected/invalid")|>
slice(2:nrow(ethn_mh1))|>
rename(
ethn_count=unweighted_count
)
write_csv(ethn_mh1,file = "data/Final_Data/Clean_Data/ETHN_MH1_CLEAN.csv")
- The first thing that is done to the data after it has been read, is the name of the columns are converted to follow the appropriate syntax
2.Renamed hispanic_or_latino_origin_ethnicity to ethnicity
selected ethnicity,mental_health_diagnosis_one,unweighted_count from columns
Filtered out all ethnicity = to Overall
Filtered out all mental_health_diagnosis_one =Overall
Filtered out all ethnicity =Missing/unknown/not collected/invalid
Removed first row to get full set of data
Changed unweight_count to ethn_count to reflect that dataset deals with ethnicity of persons who took the survey
<- read_csv("data/Final_Data/GENDER x MH1.csv") gen_mh1
Rows: 60 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): Sex, Mental health diagnosis one, Total % SE, Total % CI (lower), ...
dbl (4): Total %, Row %, Column %, Unweighted Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- gen_mh1 |>
gen_mh1 clean_names()|>
select(sex,mental_health_diagnosis_one,unweighted_count)|>
select(sex,mental_health_diagnosis_one,unweighted_count)|>
filter(sex!="Overall")|>
filter(mental_health_diagnosis_one!="Overall")|>
filter(sex!="Missing/unknown/not collected/invalid")|>
slice(2:nrow(gen_mh1))|>
rename(
sex_count=unweighted_count
)
write_csv(gen_mh1,file = "data/Final_Data/Clean_Data/GENDER_MH1_CLEAN.csv")
The first thing that is done to the data after it has been read, is the name of the columns are converted to follow the appropriate syntax
selected sex,mental_health_diagnosis_one,unweighted_count from columns
Filtered out all sex = to Overall
Filtered out all mental_health_diagnosis_one =Overall
Filtered out all sex =Missing/unknown/not collected/invalid
Removed first row to get full set of data
Changed unweight_count to sex_count to reflect that dataset deals with the gender of persons who took the survey
<- read_csv("data/Final_Data/RACE x MH1.csv") race_mh1
Rows: 120 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): Race, Mental health diagnosis one, Total % SE, Total % CI (lower),...
dbl (4): Total %, Row %, Column %, Unweighted Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- race_mh1 |>
race_mh1 clean_names()|>
select(race,mental_health_diagnosis_one,unweighted_count)|>
filter(race!="Overall")|>
filter(mental_health_diagnosis_one!="Overall")|>
filter(race!="Missing/unknown/not collected/invalid")|>
slice(2:nrow(race_mh1))|>
rename(
race_count=unweighted_count
)
write_csv(race_mh1,file = "data/Final_Data/Clean_Data/RACE_MH1_CLEAN.csv")
The first thing that is done to the data after it has been read, is the name of the columns are converted to follow the appropriate syntax
selected race,mental_health_diagnosis_one,unweighted_count from columns
Filtered out all race = to Overall
Filtered out all mental_health_diagnosis_one =Overall
Filtered out all race =Missing/unknown/not collected/invalid
Removed first row to get full set of data
Changed unweight_count to race_count to reflect that dataset deals with the different races of persons who took the survey
<- read_csv("data/Final_Data/STATEFIP x MH1.csv") stat_mh1
Rows: 750 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): Reporting state code, Mental health diagnosis one, Total % SE, Tot...
dbl (4): Total %, Row %, Column %, Unweighted Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- stat_mh1 |>
stat_mh1 clean_names()|>
select(reporting_state_code,mental_health_diagnosis_one,unweighted_count)|>
filter(reporting_state_code!="Overall")|>
filter(mental_health_diagnosis_one!="Overall")|>
filter(reporting_state_code!="Missing/unknown/not collected/invalid")|>
slice(2:nrow(stat_mh1))|>
rename(
stat_count=unweighted_count
)
write_csv(stat_mh1,file = "data/Final_Data/Clean_Data/STATE_MH1_CLEAN.csv")
The first thing that is done to the data after it has been read, is the name of the columns are converted to follow the appropriate syntax
selected reporting_state_code,mental_health_diagnosis_one,unweighted_count from columns
Filtered out all reporting_state_code = to Overall
Filtered out all mental_health_diagnosis_one =Overall
Filtered out all reporting_state_code =Missing/unknown/not collected/invalid
Removed first row to get full set of data
Changed unweight_count to stat_count to reflect that dataset deals with different states the persons who took the survey came from
<- read_csv("data/Final_Data/SUB x MH1.csv") sub_mh1
Rows: 225 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): Substance use diagnosis, Mental health diagnosis one, Total % SE, ...
dbl (4): Total %, Row %, Column %, Unweighted Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- sub_mh1 |>
sub_mh1 clean_names()|>
select(substance_use_diagnosis,mental_health_diagnosis_one,unweighted_count)|>
filter(substance_use_diagnosis!="Overall")|>
filter(mental_health_diagnosis_one!="Overall")|>
filter(substance_use_diagnosis!="Missing/unknown/not collected/invalid/no or deferred diagnosis")|>
slice(2:nrow(sub_mh1))|>
rename(
sub_count=unweighted_count
)
write_csv(sub_mh1,file = "data/Final_Data/Clean_Data/SUB_MH1_CLEAN.csv")
The first thing that is done to the data after it has been read, is the name of the columns are converted to follow the appropriate syntax
selected substance_use_diagnosis,mental_health_diagnosis_one,unweighted_count from columns
Filtered out all substance_use_diagnosis = to Overall
Filtered out all mental_health_diagnosis_one =Overall
Filtered out all substance_use_diagnosis =Missing/unknown/not collected/invalid
Removed first row to get full set of data
Changed unweight_count to sub_count to reflect that dataset deals with different subtances the persons who took the survey may have abused
<- read_csv("data/Final_Data/VETERAN x MH1.csv") vet_mh1
Rows: 60 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): Veteran status, Mental health diagnosis one, Total % SE, Total % C...
dbl (4): Total %, Row %, Column %, Unweighted Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- vet_mh1 |>
vet_mh1 clean_names()|>
select(veteran_status,mental_health_diagnosis_one,unweighted_count)|>
filter(veteran_status!="Overall")|>
filter(mental_health_diagnosis_one!="Overall")|>
filter(veteran_status!="Missing/unknown/not collected/invalid")|>
slice(2:nrow(vet_mh1))|>
rename(
vet_count=unweighted_count
)
write_csv(vet_mh1,file = "data/Final_Data/Clean_Data/VET_MH1_CLEAN.csv")
The first thing that is done to the data after it has been read, is the name of the columns are converted to follow the appropriate syntax
selected veteran_status,mental_health_diagnosis_one,unweighted_count from columns
Filtered out all veteran_status = to Overall
Filtered out all mental_health_diagnosis_one =Overall
Filtered out all veteran_status =Missing/unknown/not collected/invalid
Removed first row to get full set of data
Changed unweight_count to vet_count to reflect that dataset deals with different the people are veterans that took the survey
Other appendicies (as necessary)
# convert substance frequency table to data frame
<- read_csv("data/Final_Data/Clean_Data/SUB_MH1_CLEAN.csv") sub_freq
Rows: 181 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): substance_use_diagnosis, mental_health_diagnosis_one
dbl (1): sub_count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- sub_freq[rep(1:nrow(sub_freq), sub_freq$sub_count), -3]
sub_df write_csv(sub_df, file = "data/Final_Data/Clean_Data/SUB_MH1_DF.csv")
# convert education frequency table to data frame
<- read_csv("data/Final_Data/Clean_Data/EDUC_MH1_CLEAN.csv") ed_freq
Rows: 69 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): education, mental_health_diagnosis_one
dbl (1): edu_count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- ed_freq[rep(1:nrow(ed_freq), ed_freq$edu_count), -3]
ed_df write_csv(ed_df, file = "data/Final_Data/Clean_Data/EDUC_MH1_DF.csv")
# convert race frequency table to data frame
<- read_csv("data/Final_Data/Clean_Data/RACE_MH1_CLEAN.csv") race_freq
Rows: 83 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): race, mental_health_diagnosis_one
dbl (1): race_count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- race_freq[rep(1:nrow(race_freq), race_freq$race_count), -3]
race_df write_csv(race_df, file = "data/Final_Data/Clean_Data/RACE_MH1_DF.csv")
# convert substane frequency table to data frame
<- read_csv("data/Final_Data/Clean_Data/VET_MH1_CLEAN.csv") vet_freq
Rows: 27 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): veteran_status, mental_health_diagnosis_one
dbl (1): vet_count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- vet_freq[rep(1:nrow(vet_freq), vet_freq$vet_count), -3]
vet_df write_csv(vet_df, file = "data/Final_Data/Clean_Data/VET_MH1_DF.csv")