library(tidyverse)
library(tidymodels)
library(skimr)
library(lubridate)
Project-Elegant-Charmander
Exploratory data analysis
Research question(s)
- How does average score differ across genre? Across time? What about when viewing these traits in tandem?
- How does pitchfork differ from Anthony Fantano in the scores it gives? How is one music blogger’s work different from that of a larger publication?
Data collection and cleaning
<- read_csv("data/pitchfork.csv") pitchfork_raw
Rows: 20873 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): artist, album, genre, date, author, role, review, link, label
dbl (3): score, bnm, release_year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(pitchfork_raw)
Rows: 20,873
Columns: 12
$ artist <chr> "David Byrne", "DJ Healer", "Jorge Velez", "Chandra", "Th…
$ album <chr> "“…The Best Live Show of All Time” — NME EP", "Lost Loves…
$ genre <chr> "Rock", "Electronic", "Electronic", "Rock", "Electronic",…
$ score <dbl> 5.5, 6.2, 7.9, 7.8, 3.1, 7.8, 6.8, 7.3, 7.4, 7.7, 9.4, 7.…
$ date <chr> "January 11 2019", "January 11 2019", "January 10 2019", …
$ author <chr> "Andy Beta", "Chal Ravens", "Philip Sherburne", "Andy Bet…
$ role <chr> "Contributor", "Contributor", "Contributing Editor", "Con…
$ review <chr> "Viva Brother, Terris, Mansun, the Twang, Joe Lean & the …
$ bnm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ link <chr> "https://pitchfork.com/reviews/albums/david-byrne-the-bes…
$ label <chr> "Nonesuch", "Planet Uterus", "Self-released", "Telephone …
$ release_year <dbl> 2018, 2019, 2019, 2018, 2018, 2018, 2018, 2018, 2018, 201…
<- read_csv("data/fantano_scores.csv") fantano_raw
Rows: 1272 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): album, artist, genre, name, score, tags, url
dttm (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(fantano_raw)
Rows: 1,272
Columns: 8
$ album <chr> "Planet Her", "Butterfly 3000", "Black Metal 2", "The Life of P…
$ artist <chr> "Doja Cat", "King Gizzard & The Lizard Wizard", "Dean Blunt…
$ date <dttm> 2021-06-30 19:33:00, 2021-06-18 03:33:00, 2021-06-18 03:35:00,…
$ genre <chr> "hip hop", "rock", "other", "hip hop", "hip hop", "pop", "hip h…
$ name <chr> "Doja Cat - Planet Her", "King Gizzard & The Lizard Wizard …
$ score <chr> "5", "5", "7", "5", "6", "6", "8", "8", "6", "7", "2", "5", "6"…
$ tags <chr> "doja cat | planet her | 2021 | album | kemosabe | rap | hip ho…
$ url <chr> "https://www.theneedledrop.com/articles/2021/6/doja-cat-planet-…
<- fantano_raw |>
fantano_raw1 # Extract data and convert to correct format
separate(date, into = c("date", "time"), sep = " ") |>
mutate(date = as.Date(date, format = "%Y-%m-%d")) |>
# convert score to numeric
mutate(score = parse_number(score)) |>
mutate(score = ifelse(score > 10, 10, score)) |>
# Keep the six most important tags
separate(tags, into = c("tag1", "tag2", "tag3", "tag4", "tag5", "tag6"), sep = "\\|") |>
select(-time, -url) |>
# Delete rows with NAs
na.omit() |>
arrange(desc(score))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `score = parse_number(score)`.
Caused by warning:
! 5 parsing failures.
row col expected actual
969 -- a number case
1268 -- a number lazer
1269 -- a number alternate
1271 -- a number mono
1272 -- a number mono
Warning: Expected 6 pieces. Additional pieces discarded in 1272 rows [1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
<- fantano_raw1 |>
num_in_each_genre group_by(genre) |>
summarize(n = n())
<- fantano_raw1 |>
fantano_raw2 left_join(num_in_each_genre, by = "genre")
|>
fantano_raw1 group_by(genre) |>
summarize(n = n()) |>
filter(n >= 10)
# A tibble: 7 × 2
genre n
<chr> <int>
1 electronic 72
2 hip hop 398
3 loud rock 111
4 other 77
5 pop 263
6 rock 218
7 sub pop 14
# Merge genre with less than 10 songs with other genres
<- fantano_raw2 |>
fantano_raw2 mutate(genre = ifelse(n < 10 & grepl("electronic", genre), "electronic", genre)) |>
mutate(genre = ifelse(n < 10 & grepl("hip hop", genre), "hip hop", genre)) |>
mutate(genre = ifelse(n < 10 & grepl("loud rock", genre), "loud rock", genre)) |>
mutate(genre = ifelse(n < 10 & grepl("pop", genre), "pop", genre)) |>
mutate(genre = ifelse(n < 10 & grepl("rock", genre), "rock", genre)) |>
mutate(genre = ifelse(n < 10 & grepl("sub pop", genre), "sub pop", genre)) |>
mutate(genre = ifelse(n < 10 & !grepl("electronic", genre) & !grepl("hip hop", genre) & !grepl("loud rock", genre) & !grepl("pop", genre) & !grepl("rock", genre) & !grepl("sub pop", genre), "other", genre))
# Put other at last since it does not provide useful information
$genre <- factor(fantano_raw2$genre, levels = c("electronic", "hip hop", "loud rock", "pop", "rock", "sub pop", "other")) fantano_raw2
<- fantano_raw2 |>
fantano_cleaned select(-n, -name)
# Delete repetitive labels
<- pitchfork_raw|>
pitchfork_raw1 separate_rows(label, sep = ",") |>
group_by(album) |>
summarise(label = paste(unique(label), collapse = ","))
<- pitchfork_raw|>
pitchfork_raw2 select(-link, - review)|>
# Extract date and convert to correct format
separate(date, into = c("review_month", "review_day","review_year"), sep = " ")|>
mutate(
review_month = case_when(
== "January" ~ "01",
review_month == "February" ~ "02",
review_month == "March" ~ "03",
review_month == "April" ~ "04",
review_month == "May" ~ "05",
review_month == "June" ~ "06",
review_month == "July" ~ "07",
review_month == "August" ~ "08",
review_month == "September" ~ "09",
review_month == "October" ~ "10",
review_month == "November" ~ "11",
review_month == "December" ~ "12"),
review_month review_day = as.numeric(review_day),
review_month = as.numeric(review_month),
review_year = as.numeric(review_year),
date = as.Date(with(pitchfork_raw, paste(review_year, review_month,
sep="-")), "%Y-%m-%d"),
review_day,# Compare the release and review year
is_retrospective = if_else(release_year == review_year, "no", "yes"))|>
# Exclude irrelevant variables
select(-review_month, -review_day,-label)
# Extract data and convert to correct format
<- inner_join(x = pitchfork_raw1, y = pitchfork_raw2,
pitchfork_cleaned by = c("album"))|>
# Delete rows with NAs
na.omit() |>
arrange(desc(score))|>
# Keep the six most important tags
separate(label, into = c("tag1", "tag2", "tag3", "tag4", "tag5", "tag6"), sep = "\\,")
Warning: Expected 6 pieces. Additional pieces discarded in 46 rows [215, 1949,
2291, 2473, 2474, 2742, 2775, 3453, 5467, 5468, 5530, 6082, 6145, 6201, 6879,
6881, 7547, 7603, 8160, 8923, ...].
Warning: Expected 6 pieces. Missing pieces filled with `NA` in 15214 rows [1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
pitchfork_cleaned
# A tibble: 15,273 × 17
album tag1 tag2 tag3 tag4 tag5 tag6 artist genre score review_year
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 "\"Heroes… RCA <NA> <NA> <NA> <NA> <NA> David… Rock 10 2016
2 "1999" Warn… NA Astr… <NA> <NA> <NA> Prince Pop/… 10 2016
3 "A Love S… Verve <NA> <NA> <NA> <NA> <NA> John … Jazz 10 2015
4 "Abbey Ro… EMI <NA> <NA> <NA> <NA> <NA> The B… Rock 10 2009
5 "Another … Isla… <NA> <NA> <NA> <NA> <NA> Brian… Expe… 10 2016
6 "Appetite… Geff… <NA> <NA> <NA> <NA> <NA> Guns … Rock 10 2017
7 "Astral W… Warn… <NA> <NA> <NA> <NA> <NA> Van M… Rock 10 2015
8 "Bach: Th… CBS Colu… <NA> <NA> <NA> <NA> Glenn… Jazz 10 2017
9 "Blood on… Colu… <NA> <NA> <NA> <NA> <NA> Bob D… Rock 10 2016
10 "Born to … Colu… <NA> <NA> <NA> <NA> <NA> Bruce… Rock 10 2005
# ℹ 15,263 more rows
# ℹ 6 more variables: author <chr>, role <chr>, bnm <dbl>, release_year <dbl>,
# date <date>, is_retrospective <chr>
Data description
What are the observations (rows) and the attributes (columns)?
fantano: each row represents an album review with below attributes.
album: <char> The album in question.
artist: <char> The artist that made this album.
date: <date> Date for which this review was published.
genre: <char, categorical> Genre of the album, e.g. electronic, hip hop, etc.
score: <double> Score given by Fantano regarding this album, ranging from 1~10.
tag1-6: <char> Six main tags given to the album by Fantano, some describe the theme, other describing the genre and year the song was written.
Pitchfork: Each row represents an album review with below attributes.
album: the name of the album in question.
tags 1-6: Up to 6 tags associated with this album.
artist: The artist responsible for the album.
genre: The genre of the album, e.g. hip-hop or rock.
score: The score the album was given in this review.
review_year: The year the review was published.
author: The writer of the album review.
role: The author’s position within Pitchfork.
bnm: This is 1 if the album was awarded a “bnm,” or “best new music,” and 0 otherwise. Pitchfork awards bnms to the albums they feel are the best or most essential each year.
release_year: The year the album in question was released.
date: The date the review was published.
is_retrospective: Is “yes” if the album was reviewed in a year after it was published, and “no” otherwise.
Why was this dataset created?
Fantano: It was scraped from his blog website by kaggle.com user Apatosaur. They do not explicit write the reason for creating the dataset.
Pitchfork: It was scraped from Pitchfork’s website by components.one user Andrew Thompson. The reason for the creation of the dataset is unknown.
Who funded the creation of the dataset?
Fantano is from a music youtuber / blogger named Anthony Fantano, who has been reviewing music since the early 2010s. It was scraped from his blog website by kaggle.com user Apatosaur. Details can be found here: https://www.kaggle.com/datasets/apat0saur/theneedledrop-reviews?resource=download
Pitchfork: Nobody funded this dataset, though Thompson does credit one Nolan Conway for the idea.
What processes might have influenced what data was observed and recorded and what was not?
Fantano: Anthony Fantano’s personal music taste may influence what songs he decided to write reviews on. He may be writing songs of genres he likes more frequently.
Pitchfork: Thompson last updated this dataset in 2021, so more recent reviews will not be included.
What preprocessing was done, and how did the data come to be in the form that you are using?
The raw data is from the kaggle.com website. To transform into clean version, we did below steps:
1. Separate date to date and time and drop the time column as it does not provide useful info;
2. Correct data format into date, numeric, etc. based on what the data represents;
3. Omit rows containing NA (there are only a few);
4. Since there are originally 67 genres which will make classification difficult, so merged it into 7 genres by merging, for example, “conscious hip hop” to “hip hop”, and create factor for genre
- Pitchfork: For the pitchfork dataset, changes are made to the label and date. Repetitive labels are deleted and the column was separated into six with one tag in each cell. The “date” column is converted from string to date format. A separate column is created for the review year to compare it with the review year, and a new column that shows whether the review is retrospective is added. Unrelated variables - “link” and “review” - are removed. After tidying the dataset, pitchfork can be joined with fantano without conflicts in variable format.
If people are involved, were they aware of the data collection and if so, what purpose did they expect the data to be used for?
Fantano: Only artist names are involved; they may be expect their name to appear along with the songs for review, since publication of a song means it’s going to go public and face some judgement.
Pitchfork: To my knowledge the authors and artist are not aware that this data has been collected. Given that it is all displayed professionally for mass consumption on a website, however, and involves no personal data, I doubt they would mind using these reviews as analysis.
Data limitations
For fantano_cleaned, since it’s from a music youtuber / blogger named Anthony Fantano who has been reviewing music since the early 2010s, it may not be representative of public taste of US people: every rows of review is wrote by the same person Anthony Fantano. On the other hand, pitchfork is a very famous music publication, so it is more representative of public opinions. Also, pitchfork and fantano don’t review exactly the same albums, so It will be meaningless to directly combine them; but it would be useful to make a contrast between them on common albums they have.
Another limitation is that fantano_cleaned mainly focused on modern music (the earliest data is 2013-03-26), so does not provide a comprehensive view of music taste earlier than 2013. With that said, it is sufficient for our use since we are interested in investigating the music taste changes in the recent 10 years.
Additionally, during exploratory data analysis, we found it difficult to analyze non-numeric columns such as “review” in pitchfork and “album” and “artist” in both data frames. Because R Studio is not good at analyzing string patterns, our analysis may be based on categorical and numeric variables for most of the parts.
|>
fantano_cleaned arrange(date)
# A tibble: 1,267 × 11
album artist date genre score tag1 tag2 tag3 tag4 tag5 tag6
<chr> <chr> <date> <fct> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 The 20/20 … Justi… 2013-03-26 pop 10 "201… " da… " ju… " mi… " mu… " po…
2 A Special … Open … 2015-02-06 hip … 8 "201… " a … " ep… " hi… " op… " ra…
3 Xe Zs 2015-02-06 rock 8 "201… " al… " dr… " ex… " fr… " ja…
4 Lost Themes John … 2015-02-06 elec… 7 "alb… " ho… " jo… " lo… " pr… " RE…
5 To Pimp A … Kendr… 2015-06-20 hip … 10 "ken… " to… " 20… " 10… " al… " to…
6 The Epic Kamas… 2015-06-20 other 9 "kam… " th… " al… " 20… " 9/… " po…
7 The Powers… Death… 2015-06-20 hip … 9 "dea… " th… " ni… " je… " th… " al…
8 Citizen Zo… The P… 2015-06-20 pop 9 "uk " " th… " ci… " 9/… " al… " 20…
9 Painted Sh… Hop A… 2015-06-20 rock 8 "hop… " pa… " al… " 20… " sa… " ro…
10 Frozen Nia… Pruri… 2015-06-20 other 8 "pru… " fr… " al… " 20… " pr… " ot…
# ℹ 1,257 more rows
Exploratory data analysis
Perform an (initial) exploratory data analysis.
fantano_cleaned
# A tibble: 1,267 × 11
album artist date genre score tag1 tag2 tag3 tag4 tag5 tag6
<chr> <chr> <date> <fct> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 Spare Ribs Sleaf… 2021-03-19 hip … 10 "sle… " sp… " 20… " al… " uk… " ro…
2 You Won't … Daugh… 2018-11-01 loud… 10 "dau… " yo… " 20… " ip… " lo… " no…
3 KIDS SEE G… Kids … 2018-06-11 hip … 10 "kid… " se… " 20… " g.… " ka… " ki…
4 To Pimp A … Kendr… 2015-06-20 hip … 10 "ken… " to… " 20… " 10… " al… " to…
5 The 20/20 … Justi… 2013-03-26 pop 10 "201… " da… " ju… " mi… " mu… " po…
6 Daddy's Ho… St. V… 2021-05-18 pop 9 "st.… " da… " 20… " al… " lo… " po…
7 Promises Float… 2021-04-12 other 9 "flo… " ph… " lo… " ls… " al… " 20…
8 Plastic Be… Goril… 2021-01-12 pop 9 "gor… " pl… " al… " 20… " 20… " re…
9 Windswept … Ichik… 2020-12-09 other 9 "ich… " wi… " ad… " al… " 20… " he…
10 Visions of… clipp… 2020-10-28 sub … 9 "cli… " vi… " 20… " al… " su… " ra…
# ℹ 1,257 more rows
|>
fantano_cleaned group_by(genre) |>
summarize(score = score,
album_number = n()) |>
ggplot(aes(x = score, y = genre)) +
geom_boxplot() +
labs(
title = "Fantano Album Scores by Genre",
x = "Score (1 - 10)",
y = "Genre"
+
) theme_minimal()
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
`summarise()` has grouped output by 'genre'. You can override using the
`.groups` argument.
<- pitchfork_cleaned |>
year_score_sum group_by(release_year) |>
summarize(avgscore = mean(score))
|>
year_score_sum ggplot(aes(x = release_year, y = avgscore)) +
geom_line() +
theme_minimal() +
geom_vline(xintercept = 1997) +
labs(
title = "Average album rating based on album release year",
subtitle = "Vertical line when pitchfork starts releasing reviews",
x = "Year of album release",
y = "Average score among albums released that year"
)
<- pitchfork_cleaned |>
year_score_sum_m group_by(release_year, is_retrospective) |>
summarize(avgscore = mean(score))
`summarise()` has grouped output by 'release_year'. You can override using the
`.groups` argument.
year_score_sum_m
# A tibble: 80 × 3
# Groups: release_year [60]
release_year is_retrospective avgscore
<dbl> <chr> <dbl>
1 1957 yes 8.95
2 1960 yes 9
3 1962 yes 8.2
4 1963 yes 9.43
5 1964 yes 9.34
6 1965 yes 8.96
7 1966 yes 8.52
8 1967 yes 8.68
9 1968 yes 8.39
10 1969 yes 8.78
# ℹ 70 more rows
|>
year_score_sum_m ggplot(aes(x = release_year, y = avgscore, color = is_retrospective, group = is_retrospective)) +
geom_line() +
theme_minimal() +
xlim(1996, 2020) +
labs(
title = "Average album rating based on album release year",
subtitle = "Comparing retrospective and modern reviews",
x = "Year of album release",
y = "Average score among albums released that year",
color = "Is the review retrospective?"
+
) theme(
legend.position = "bottom",
legend.direction = "horizontal"
)
Warning: Removed 36 rows containing missing values (`geom_line()`).
<- fantano_cleaned |>
fantano_new mutate(year = year(date)) |>
select(genre, score, year) |>
group_by(genre, year) |>
filter(genre == "hip hop") |>
group_by(year) |>
summarize(average_score = mean(score)) |>
filter(average_score != "NA")
ggplot(fantano_new, mapping = aes(x = year, y = average_score)) +
geom_point() +
geom_line() +
labs (
title = "Average Score of Hip Hop Album per Year",
x = "Year",
y = "Average Score",
+
) theme_minimal()
Questions for reviewers
List specific questions for your peer reviewers and project mentor to answer in giving you feedback on this phase.
Are there any other relationships that you would like to see us analyze?
Are there any improvements that we can make to our data sets or graphs?
Are there any columns in our datasets that you don’t think we’ll need?
<- pitchfork_cleaned |>
pitchfork_2genre filter(genre %in% c("Folk/Country", "Rock")) |>
select(genre, score, release_year) |>
na.omit()
<- pitchfork_2genre |>
pitchfork_summary group_by(genre, release_year) |>
summarize(n = n(), mean_score = mean(score))
`summarise()` has grouped output by 'genre'. You can override using the
`.groups` argument.
|>
pitchfork_summary ggplot(aes(x = release_year, y = mean_score, color = genre)) +
geom_line() +
theme_minimal() +
labs(
title = "Average album rating based on album release year",
subtitle = "Vertical line when pitchfork starts releasing reviews",
x = "Year of album release",
y = "Average score among albums released that year"
+
) facet_wrap(~genre)