library(tidyverse)
library(skimr)
library(lubridate)
# An Analysis of Modern Music Criticism
# Appendix to report
# Data cleaning
# Fantano data set:
# Clean the raw Fantano reviews: parse the review date, coerce the score to a
# number capped at 10, split the tag string into six tag columns, drop the
# columns that are no longer needed and remove incomplete rows.
# NOTE(review): `fantano_raw` is not created in this section; it is assumed to
# have been loaded earlier -- confirm.
fantano_raw1 <- fantano_raw |>
  # Extract data and convert to correct format
  separate(date, into = c("date", "time"), sep = " ") |>
  mutate(
    date = as.Date(date, format = "%Y-%m-%d"),
    # Convert score to numeric and cap it at 10
    score = pmin(parse_number(score), 10)
  ) |>
  # Keep the six most important tags: extra tags are dropped and missing ones
  # are filled with NA (made explicit so separate() does not warn)
  separate(
    tags,
    into = c("tag1", "tag2", "tag3", "tag4", "tag5", "tag6"),
    sep = "\\|",
    extra = "drop",
    fill = "right"
  ) |>
  select(-time, -url) |>
  # Delete rows with NAs
  # NOTE(review): this also removes every review that has fewer than six tags,
  # because the unfilled tag columns are NA -- confirm this is intended.
  drop_na() |>
  arrange(desc(score))
# Number of rows in each genre (one row per genre, count stored in `n`)
num_in_each_genre <- fantano_raw1 |>
  count(genre)
# Attach the per-genre row count `n` to every row
fantano_raw2 <- fantano_raw1 |>
  left_join(num_in_each_genre, by = "genre")
# Merge genres with fewer than 10 rows into one of the broad genres.
# Genres with at least 10 rows are kept unchanged. For the small ones the first
# matching pattern wins, so the more specific patterns ("loud rock", "sub pop")
# must be tested before the patterns they contain ("rock", "pop"); otherwise a
# small "loud rock" genre would end up as "rock" and "sub pop" as "pop".
# Small genres matching none of the patterns become "other".
fantano_raw2 <- fantano_raw2 |>
  mutate(
    genre = case_when(
      n >= 10 ~ genre,
      grepl("electronic", genre) ~ "electronic",
      grepl("hip hop", genre) ~ "hip hop",
      grepl("loud rock", genre) ~ "loud rock",
      grepl("sub pop", genre) ~ "sub pop",
      grepl("pop", genre) ~ "pop",
      grepl("rock", genre) ~ "rock",
      TRUE ~ "other"
    )
  )
# Put "other" last since it does not provide useful information
# NOTE(review): any genre value that is not one of these levels becomes NA --
# confirm that every genre with 10 or more rows is listed here.
fantano_raw2$genre <- factor(
  fantano_raw2$genre,
  levels = c(
    "electronic", "hip hop", "loud rock", "pop", "rock", "sub pop", "other"
  )
)
# Final Fantano data set: drop the helper count `n` and the `name` column
fantano_cleaned <- fantano_raw2 |>
  select(-n, -name)
# Pitchfork data set:
# Delete repetitive labels: split the comma-separated `label` string into one
# row per label, then collapse the distinct labels of each album back into a
# single comma-separated string (one row per album).
# NOTE(review): `pitchfork_raw` is not created in this section; it is assumed
# to have been loaded earlier -- confirm.
pitchfork_raw1 <- pitchfork_raw |>
  separate_rows(label, sep = ",") |>
  group_by(album) |>
  summarise(label = paste(unique(label), collapse = ","))
# Build the review date and the retrospective flag for every Pitchfork review.
# The `date` string is split on spaces into month name, day and year; these
# are converted to numbers and recombined into a Date. `label` is dropped here
# because the de-duplicated labels are kept in `pitchfork_raw1`.
pitchfork_raw2 <- pitchfork_raw |>
  select(-link, -review) |>
  # Extract date and convert to correct format
  separate(
    date,
    into = c("review_month", "review_day", "review_year"),
    sep = " "
  ) |>
  mutate(
    # Full English month name -> month number (NA if it is not a month name)
    review_month = match(review_month, month.name),
    review_day = as.numeric(review_day),
    review_year = as.numeric(review_year),
    date = as.Date(
      paste(review_year, review_month, review_day, sep = "-"),
      format = "%Y-%m-%d"
    ),
    # Compare the release and review year
    is_retrospective = if_else(release_year == review_year, "no", "yes")
  ) |>
  # Exclude irrelevant variables
  select(-review_month, -review_day, -label)
# Join pitchfork_raw1 (de-duplicated labels) and pitchfork_raw2 (review data)
# by album, drop incomplete rows, sort by score and split the labels into six
# tag columns.
pitchfork_cleaned <- inner_join(
  x = pitchfork_raw1,
  y = pitchfork_raw2,
  by = "album"
) |>
  # Delete rows with NAs
  drop_na() |>
  arrange(desc(score)) |>
  # Keep the six most important tags: extra labels are dropped and missing
  # ones are filled with NA (made explicit so separate() does not warn)
  separate(
    label,
    into = c("tag1", "tag2", "tag3", "tag4", "tag5", "tag6"),
    sep = "\\,",
    extra = "drop",
    fill = "right"
  )
# Write both cleaned data sets to CSV files in the data/ directory
fantano_cleaned |>
  write_csv(file = "data/fantano-cleaned.csv")
pitchfork_cleaned |>
  write_csv(file = "data/pitchfork-cleaned.csv")