UFO Sightings

Appendix to report

Data cleaning

  1. We started by cleaning all the names in the dataset using the janitor package and making them more readable.
  2. We filtered to include only the required columns.
  3. Then we merged the two datasets using the common column names they had.
  4. We recreated the date column to be in the format we wanted.
  5. We also changed our month column to be in abbreviation version rather than numbered.
  6. Our duration column had a lot of varying types of inputs so we created a function to clean this data to only include minutes. However, later we did not end up using this column.
  7. We also cleaned our year column to reflect the correct century - eg. some of them said 23 instead of 2023.
library(tidyverse)
library(skimr)
library(openxlsx)
library(scales)
library(janitor)
library(viridis)
library(dbscan)
ufo1 <- read_csv("data/nuforc_events.csv")
Rows: 110265 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): City, State, Shape, Duration, Summary, Event_URL
dbl  (5): Year, Month, Day, Hour, Minute
dttm (1): Event_Time
date (1): Event_Date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ufo2 <- read_csv("data/ufo_data_nuforc.csv")
Rows: 1317 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): posted, date, city, state, shape, duration, summary, images, img_link
dbl  (3): lat, lng, population
time (1): time

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#cleaning the names before cleaning dataset 
ufo1_cleaned <- ufo1 |> 
  janitor::clean_names() |> 
  select(-event_time, -event_date) 

ufo2_cleaned <- ufo2 |> 
  janitor::clean_names() |> 
  select(-posted) |> 
  mutate(date = as.Date(date, "%m/%d/%Y")) |> 
  mutate(hour = hour(time), 
         minute = minute(time), 
         year = year(date), 
         month = month(date), 
         day = day(date))

#selecting only needed columns 
ufo2_cleaned <- ufo2_cleaned |> 
  select(-date, -time)

ufo2_cleaned <- ufo2_cleaned |> 
  mutate(year = ifelse(year > 23, paste0("19", year), paste0("20", year))) |> 
  mutate(year = as.numeric(year))

common_columns <- c("year", "month", "day", "hour", "minute", "city", "state", "shape", "duration", "summary") 

#merging 
merged <- bind_rows(ufo1_cleaned, ufo2_cleaned)

#recreating date 
merged$Date <- as.Date(paste
                       (merged$month, merged$day, merged$year, sep = "/"), 
                       format = "%m/%d/%Y")

#changing the month values to be abbrevs instead
merged <-merged |>
  mutate(month = recode(month, 
                        "01" = "Jan",
                        "02" = "Feb",
                        "03" = "Mar",
                        "04" = "Apr",
                        "05" = "May",
                        "06" = "June",
                        "07" = "July",
                        "08" = "Aug",
                        "09" = "Sep",
                        "10" = "Oct",
                        "11" = "Nov",
                        "12" = "Dec"))

#removing different variations of minutes in the duration column
remove_minutes_variations <- function(duration) {
  minutes_variations <- c("min", "mins", "minutes", "minute")
  duration <- gsub(paste(minutes_variations, collapse = "|"), "", duration, ignore.case = TRUE)
  return(duration)
}

#creating a function to turn all values in duration to minutes
to_minutes <- function(duration) {
  if (grepl("[~><-]", duration)) {
    return(duration)  # Keep such values as they are
  } else {
    # Extracting numeric values and their units
    numeric_value <- as.numeric(gsub("[^0-9]+", "", duration))
    unit <- tolower(gsub("[0-9]+", "", duration))
    
    # Use the remove_minutes_variations function to remove variations of "minutes"
    duration <- remove_minutes_variations(duration)
    
      # Converting all other values to minutes
      if (grepl("sec | Sec | secs | Secs | Seconds | Second | seconds | second", unit)) {
        return(round(numeric_value / 60, 3))
      } else if (grepl("hours | hour | Hour | Hours ", unit)) {
        return(round(numeric_value * 60, 3))
      } else {
        return(duration)  # Retaining strings and others as is
      }
    }
}

# Apply the to_minutes function to the 'duration' column of your dataframe
merged$duration <- sapply(merged$duration, to_minutes)
write.csv(merged, "~/fa23_project/project-elegant-charmander/data/ufo_merged_data.csv", row.names = FALSE) 

Other appendicies (as necessary)