library(tidyverse)
library(skimr)
library(openxlsx)
library(scales)
library(janitor)
library(viridis)
library(dbscan)UFO Sightings
Appendix to report
Data cleaning
- We started by cleaning all the names in the dataset using the janitor package and making them more readable.
- We filtered to include only the required columns.
- Then we merged the two datasets using the common column names they had.
- We recreated the date column to be in the format we wanted.
- We also changed our month column to be in abbreviation version rather than numbered.
- Our duration column had a lot of varying types of inputs so we created a function to clean this data to only include minutes. However, later we did not end up using this column.
- We also cleaned our year column to reflect the correct century - eg. some of them said 23 instead of 2023.
ufo1 <- read_csv("data/nuforc_events.csv")Rows: 110265 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): City, State, Shape, Duration, Summary, Event_URL
dbl (5): Year, Month, Day, Hour, Minute
dttm (1): Event_Time
date (1): Event_Date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ufo2 <- read_csv("data/ufo_data_nuforc.csv")Rows: 1317 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): posted, date, city, state, shape, duration, summary, images, img_link
dbl (3): lat, lng, population
time (1): time
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#cleaning the names before cleaning dataset
ufo1_cleaned <- ufo1 |>
janitor::clean_names() |>
select(-event_time, -event_date)
ufo2_cleaned <- ufo2 |>
janitor::clean_names() |>
select(-posted) |>
mutate(date = as.Date(date, "%m/%d/%Y")) |>
mutate(hour = hour(time),
minute = minute(time),
year = year(date),
month = month(date),
day = day(date))
#selecting only needed columns
ufo2_cleaned <- ufo2_cleaned |>
select(-date, -time)
ufo2_cleaned <- ufo2_cleaned |>
mutate(year = ifelse(year > 23, paste0("19", year), paste0("20", year))) |>
mutate(year = as.numeric(year))
common_columns <- c("year", "month", "day", "hour", "minute", "city", "state", "shape", "duration", "summary")
#merging
merged <- bind_rows(ufo1_cleaned, ufo2_cleaned)
#recreating date
merged$Date <- as.Date(paste
(merged$month, merged$day, merged$year, sep = "/"),
format = "%m/%d/%Y")
#changing the month values to be abbrevs instead
merged <-merged |>
mutate(month = recode(month,
"01" = "Jan",
"02" = "Feb",
"03" = "Mar",
"04" = "Apr",
"05" = "May",
"06" = "June",
"07" = "July",
"08" = "Aug",
"09" = "Sep",
"10" = "Oct",
"11" = "Nov",
"12" = "Dec"))
#removing different variations of minutes in the duration column
remove_minutes_variations <- function(duration) {
minutes_variations <- c("min", "mins", "minutes", "minute")
duration <- gsub(paste(minutes_variations, collapse = "|"), "", duration, ignore.case = TRUE)
return(duration)
}
#creating a function to turn all values in duration to minutes
to_minutes <- function(duration) {
if (grepl("[~><-]", duration)) {
return(duration) # Keep such values as they are
} else {
# Extracting numeric values and their units
numeric_value <- as.numeric(gsub("[^0-9]+", "", duration))
unit <- tolower(gsub("[0-9]+", "", duration))
# Use the remove_minutes_variations function to remove variations of "minutes"
duration <- remove_minutes_variations(duration)
# Converting all other values to minutes
if (grepl("sec | Sec | secs | Secs | Seconds | Second | seconds | second", unit)) {
return(round(numeric_value / 60, 3))
} else if (grepl("hours | hour | Hour | Hours ", unit)) {
return(round(numeric_value * 60, 3))
} else {
return(duration) # Retaining strings and others as is
}
}
}
# Apply the to_minutes function to the 'duration' column of your dataframe
merged$duration <- sapply(merged$duration, to_minutes)write.csv(merged, "~/fa23_project/project-elegant-charmander/data/ufo_merged_data.csv", row.names = FALSE)