library(tidyverse)
library(readxl)
# Data cleaning ----
# Reads the raw anaemia estimates workbook, keeps the rows and columns used in
# the analysis, tags each row as "Developing" or "Developed", writes the
# result to CSV, and reads that CSV back in.

# Raw input: the "data" sheet of the downloaded Excel workbook.
anaemia_estimates_inputdata_final <- read_excel(
  "data/anaemia-estimates_inputdata_final.xlsx",
  sheet = "data"
)

# Member states treated as developing countries. Any member_state not listed
# here is labelled "Developed" below.
# NOTE(review): matching is by exact string, so spellings must agree with the
# member_state column (e.g. "Cote d'Ivoire" without the accent) -- confirm
# against the workbook.
developing_countries <- c(
  "Albania", "Armenia", "Azerbaijan", "Georgia", "North Macedonia",
  "Republic of Moldova", "Montenegro", "Romania", "Serbia", "Ukraine",
  "Egypt", "Morocco", "Tunisia", "Angola", "Benin", "Burkina Faso",
  "Burundi", "Cameroon", "Cabo Verde", "Central African Republic", "Chad",
  "Congo", "Democratic Republic of the Congo", "Cote d'Ivoire",
  "Equatorial Guinea", "Eswatini", "Ethiopia", "Gabon", "Gambia", "Ghana",
  "Guinea", "Kenya", "Lesotho", "Liberia", "Madagascar", "Malawi", "Mali",
  "Mauritius", "Mozambique", "Namibia", "Niger", "Nigeria", "Rwanda",
  "Sao Tome and Principe", "Senegal", "Sierra Leone", "Somalia",
  "South Africa", "Sudan", "United Republic of Tanzania", "Togo", "Uganda",
  "Zambia", "Zimbabwe", "Belize", "Costa Rica", "Cuba", "Dominica",
  "Dominican Republic", "El Salvador", "Guatemala", "Haiti", "Honduras",
  "Mexico", "Nicaragua", "Panama", "Argentina",
  "Bolivia (Plurinational State of)", "Brazil", "Colombia", "Ecuador",
  "Guyana", "Peru", "Afghanistan", "Bangladesh", "Bhutan", "Cambodia",
  "China", "India", "Indonesia", "Kazakhstan",
  "Democratic People's Republic of Korea", "Kyrgyzstan",
  "Lao People's Democratic Republic", "Malaysia", "Maldives", "Mongolia",
  "Myanmar", "Nepal", "Pakistan", "Philippines", "Sri Lanka", "Tajikistan",
  "Thailand", "Turkmenistan", "Uzbekistan", "Viet Nam",
  "Iran (Islamic Republic of)", "Iraq", "Jordan", "Lebanon",
  "Occupied Palestinian Territory", "Yemen", "Fiji", "Marshall Islands",
  "Nauru", "Papua New Guinea", "Samoa", "Solomon Islands", "Tuvalu",
  "Vanuatu", "Antigua and Barbuda", "Bahrain",
  "Micronesia (Federated States of)", "Oman", "Qatar", "Timor-Leste"
)

# Keep rows with sex code 2 (the rows for women, per the data-cleaning notes
# in this document), restrict to the analysis columns, and add the
# development classification.
anaemia_estimates_csv <- anaemia_estimates_inputdata_final |>
  filter(sex == 2) |>
  select(
    member_state, beginyear, sex, agerange, pregnancy, samplesize, mean,
    below130, below120, below110, below115, below100, below90, below80,
    below70
  ) |>
  mutate(
    # %in% never returns NA, so a missing member_state is labelled "Developed".
    country_classification = if_else(
      member_state %in% developing_countries,
      "Developing",
      "Developed"
    )
  )

# Persist the cleaned data, then load it back from disk as the final dataset.
write_csv(anaemia_estimates_csv, file = "data/final-anemia.csv")
anaemia_estimates_csv_final <- read_csv("data/final-anemia.csv")
Anemia in Women
Appendix to report
Data cleaning
The original dataset was sourced from the WHO website. No web scraping or surveying was required or performed to gather this dataset; the data was downloaded as an Excel file and imported into RStudio.
However, in the context of our research questions, the data were modified and preprocessed in order to extract the data that was valuable and specific to our investigation. In particular, given that our research questions focused on women, we first filtered the data to include only women, using the `sex` variable in our dataset. We then selected only the columns relevant to our research questions (i.e. `member_state`, `beginyear`, `sex`, `agerange`, `pregnancy`, `samplesize`, `mean`, `below130`, `below120`, `below110`, `below115`, `below100`, `below90`, `below80`, `below70`). In addition, we created a vector, `developing_countries`, specifying the developing countries in our dataset (based on the list of developing countries as declared by the Minister for Foreign Affairs, cross-checked with searches from the World Bank’s website). In turn, we mutated our dataset to create another column, `country_classification`, which indicates whether a country in the dataset is a developing or developed country based on whether the country appears in the `developing_countries` vector. Our final dataset includes 16 columns (excluding the eight threshold “`belowX`” columns, 8 unique columns). Each observation within our dataset includes the recorded values from a sample of women surveyed, grouped by country, survey year, age range of the sample, and pregnancy status.