Project-Elegant-Charmander

Exploratory data analysis

Research question(s)

How does average score differ across genre? Across time? What about when viewing these traits in tandem?
How does pitchfork differ from Anthony Fantano in the scores it gives? How is one music blogger’s work different from that of a larger publication?

Data collection and cleaning

library(tidyverse)
library(tidymodels)
library(skimr)
library(lubridate)

pitchfork_raw <- read_csv("data/pitchfork.csv")

Rows: 20873 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): artist, album, genre, date, author, role, review, link, label
dbl (3): score, bnm, release_year

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(pitchfork_raw)

Rows: 20,873
Columns: 12
$ artist       <chr> "David Byrne", "DJ Healer", "Jorge Velez", "Chandra", "Th…
$ album        <chr> "“…The Best Live Show of All Time” — NME EP", "Lost Loves…
$ genre        <chr> "Rock", "Electronic", "Electronic", "Rock", "Electronic",…
$ score        <dbl> 5.5, 6.2, 7.9, 7.8, 3.1, 7.8, 6.8, 7.3, 7.4, 7.7, 9.4, 7.…
$ date         <chr> "January 11 2019", "January 11 2019", "January 10 2019", …
$ author       <chr> "Andy Beta", "Chal Ravens", "Philip Sherburne", "Andy Bet…
$ role         <chr> "Contributor", "Contributor", "Contributing Editor", "Con…
$ review       <chr> "Viva Brother, Terris, Mansun, the Twang, Joe Lean & the …
$ bnm          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ link         <chr> "https://pitchfork.com/reviews/albums/david-byrne-the-bes…
$ label        <chr> "Nonesuch", "Planet Uterus", "Self-released", "Telephone …
$ release_year <dbl> 2018, 2019, 2019, 2018, 2018, 2018, 2018, 2018, 2018, 201…

fantano_raw <- read_csv("data/fantano_scores.csv")

Rows: 1272 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (7): album, artist, genre, name, score, tags, url
dttm (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(fantano_raw)

Rows: 1,272
Columns: 8
$ album  <chr> "Planet Her", "Butterfly 3000", "Black Metal 2", "The Life of P…
$ artist <chr> "Doja Cat", "King Gizzard &amp; The Lizard Wizard", "Dean Blunt…
$ date   <dttm> 2021-06-30 19:33:00, 2021-06-18 03:33:00, 2021-06-18 03:35:00,…
$ genre  <chr> "hip hop", "rock", "other", "hip hop", "hip hop", "pop", "hip h…
$ name   <chr> "Doja Cat - Planet Her", "King Gizzard &amp; The Lizard Wizard …
$ score  <chr> "5", "5", "7", "5", "6", "6", "8", "8", "6", "7", "2", "5", "6"…
$ tags   <chr> "doja cat | planet her | 2021 | album | kemosabe | rap | hip ho…
$ url    <chr> "https://www.theneedledrop.com/articles/2021/6/doja-cat-planet-…

fantano_raw1 <- fantano_raw |> 
  # Extract data and convert to correct format
  separate(date, into = c("date", "time"), sep = " ") |>
  mutate(date = as.Date(date, format = "%Y-%m-%d")) |>
  # convert score to numeric
  mutate(score = parse_number(score)) |>
  mutate(score = ifelse(score > 10, 10, score)) |>
  # Keep the six most important tags
  separate(tags, into = c("tag1", "tag2", "tag3", "tag4", "tag5", "tag6"), sep = "\\|") |>
  select(-time, -url) |>
  # Delete rows with NAs
  na.omit() |>
  arrange(desc(score))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `score = parse_number(score)`.
Caused by warning:
! 5 parsing failures.
 row col expected    actual
 969  -- a number case     
1268  -- a number lazer    
1269  -- a number alternate
1271  -- a number mono     
1272  -- a number mono

Warning: Expected 6 pieces. Additional pieces discarded in 1272 rows [1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

num_in_each_genre <- fantano_raw1 |>
  group_by(genre) |>
  summarize(n = n())

fantano_raw2 <- fantano_raw1 |> 
  left_join(num_in_each_genre, by = "genre")

fantano_raw1 |>
  group_by(genre) |>
  summarize(n = n()) |>
  filter(n >= 10)

# A tibble: 7 × 2
  genre          n
  <chr>      <int>
1 electronic    72
2 hip hop      398
3 loud rock    111
4 other         77
5 pop          263
6 rock         218
7 sub pop       14

# Merge genre with less than 10 songs with other genres
fantano_raw2 <- fantano_raw2 |>
  mutate(genre = ifelse(n < 10 & grepl("electronic", genre), "electronic", genre)) |>
  mutate(genre = ifelse(n < 10 & grepl("hip hop", genre), "hip hop", genre)) |>
  mutate(genre = ifelse(n < 10 & grepl("loud rock", genre), "loud rock", genre)) |>
  mutate(genre = ifelse(n < 10 & grepl("pop", genre), "pop", genre)) |>
  mutate(genre = ifelse(n < 10 & grepl("rock", genre), "rock", genre)) |>
  mutate(genre = ifelse(n < 10 & grepl("sub pop", genre), "sub pop", genre)) |>
  mutate(genre = ifelse(n < 10 & !grepl("electronic", genre) & !grepl("hip hop", genre) & !grepl("loud rock", genre) & !grepl("pop", genre) & !grepl("rock", genre) & !grepl("sub pop", genre), "other", genre))

# Put other at last since it does not provide useful information
fantano_raw2$genre <- factor(fantano_raw2$genre, levels = c("electronic", "hip hop", "loud rock", "pop", "rock", "sub pop", "other"))

fantano_cleaned <- fantano_raw2 |>
  select(-n, -name)

# Delete repetitive labels
pitchfork_raw1 <- pitchfork_raw|>
  separate_rows(label, sep = ",") |>
  group_by(album) |>
  summarise(label = paste(unique(label), collapse = ","))

pitchfork_raw2 <- pitchfork_raw|>
  select(-link, - review)|>
  # Extract date and convert to correct format
  separate(date, into = c("review_month", "review_day","review_year"), sep = " ")|>  
  mutate(
         review_month = case_when(
         review_month == "January" ~ "01",
         review_month == "February" ~ "02",
         review_month == "March" ~ "03",
         review_month == "April" ~ "04",
         review_month == "May" ~ "05",
         review_month == "June" ~ "06",
         review_month == "July" ~ "07",
         review_month == "August" ~ "08",
         review_month == "September" ~ "09",
         review_month == "October" ~ "10",
         review_month == "November" ~ "11",
         review_month == "December" ~ "12"),
         review_day = as.numeric(review_day),
         review_month = as.numeric(review_month),
         review_year = as.numeric(review_year),
         date = as.Date(with(pitchfork_raw, paste(review_year, review_month,
                                                  review_day,sep="-")), "%Y-%m-%d"),
         # Compare the release and review year
         is_retrospective = if_else(release_year == review_year, "no", "yes"))|>
         # Exclude irrelevant variables
         select(-review_month, -review_day,-label)
    
# Extract data and convert to correct format
pitchfork_cleaned <- inner_join(x = pitchfork_raw1, y = pitchfork_raw2, 
                       by = c("album"))|>
  # Delete rows with NAs
      na.omit() |>
      arrange(desc(score))|>
  # Keep the six most important tags
    separate(label, into = c("tag1", "tag2", "tag3", "tag4", "tag5", "tag6"), sep = "\\,")

Warning: Expected 6 pieces. Additional pieces discarded in 46 rows [215, 1949,
2291, 2473, 2474, 2742, 2775, 3453, 5467, 5468, 5530, 6082, 6145, 6201, 6879,
6881, 7547, 7603, 8160, 8923, ...].

Warning: Expected 6 pieces. Missing pieces filled with `NA` in 15214 rows [1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

pitchfork_cleaned

# A tibble: 15,273 × 17
   album      tag1  tag2  tag3  tag4  tag5  tag6  artist genre score review_year
   <chr>      <chr> <chr> <chr> <chr> <chr> <chr> <chr>  <chr> <dbl>       <dbl>
 1 "\"Heroes… RCA   <NA>  <NA>  <NA>  <NA>  <NA>  David… Rock     10        2016
 2 "1999"     Warn… NA    Astr… <NA>  <NA>  <NA>  Prince Pop/…    10        2016
 3 "A Love S… Verve <NA>  <NA>  <NA>  <NA>  <NA>  John … Jazz     10        2015
 4 "Abbey Ro… EMI   <NA>  <NA>  <NA>  <NA>  <NA>  The B… Rock     10        2009
 5 "Another … Isla… <NA>  <NA>  <NA>  <NA>  <NA>  Brian… Expe…    10        2016
 6 "Appetite… Geff… <NA>  <NA>  <NA>  <NA>  <NA>  Guns … Rock     10        2017
 7 "Astral W… Warn… <NA>  <NA>  <NA>  <NA>  <NA>  Van M… Rock     10        2015
 8 "Bach: Th… CBS   Colu… <NA>  <NA>  <NA>  <NA>  Glenn… Jazz     10        2017
 9 "Blood on… Colu… <NA>  <NA>  <NA>  <NA>  <NA>  Bob D… Rock     10        2016
10 "Born to … Colu… <NA>  <NA>  <NA>  <NA>  <NA>  Bruce… Rock     10        2005
# ℹ 15,263 more rows
# ℹ 6 more variables: author <chr>, role <chr>, bnm <dbl>, release_year <dbl>,
#   date <date>, is_retrospective <chr>

Data description

What are the observations (rows) and the attributes (columns)?

fantano: each row represents an album review with below attributes.

album: <char> The album in question.

artist: <char> The artist that made this album.

date: <date> Date for which this review was published.

genre: <char, categorical> Genre of the album, e.g. electronic, hip hop, etc.

score: <double> Score given by Fantano regarding this album, ranging from 1~10.

tag1-6: <char> Six main tags given to the album by Fantano, some describe the theme, other describing the genre and year the song was written.
- Pitchfork: Each row represents an album review with below attributes.
  - album: the name of the album in question.
  - tags 1-6: Up to 6 tags associated with this album.
  - artist: The artist responsible for the album.
  - genre: The genre of the album, e.g. hip-hop or rock.
  - score: The score the album was given in this review.
  - review_year: The year the review was published.
  - author: The writer of the album review.
  - role: The author’s position within Pitchfork.
  - bnm: This is 1 if the album was awarded a “bnm,” or “best new music,” and 0 otherwise. Pitchfork awards bnms to the albums they feel are the best or most essential each year.
  - release_year: The year the album in question was released.
  - date: The date the review was published.
  - is_retrospective: Is “yes” if the album was reviewed in a year after it was published, and “no” otherwise.
Why was this dataset created?

Fantano: It was scraped from his blog website by kaggle.com user Apatosaur. They do not explicit write the reason for creating the dataset.

Pitchfork: It was scraped from Pitchfork’s website by components.one user Andrew Thompson. The reason for the creation of the dataset is unknown.
Who funded the creation of the dataset?

Fantano is from a music youtuber / blogger named Anthony Fantano, who has been reviewing music since the early 2010s. It was scraped from his blog website by kaggle.com user Apatosaur. Details can be found here: https://www.kaggle.com/datasets/apat0saur/theneedledrop-reviews?resource=download

Pitchfork: Nobody funded this dataset, though Thompson does credit one Nolan Conway for the idea.
What processes might have influenced what data was observed and recorded and what was not?

Fantano: Anthony Fantano’s personal music taste may influence what songs he decided to write reviews on. He may be writing songs of genres he likes more frequently.

Pitchfork: Thompson last updated this dataset in 2021, so more recent reviews will not be included.
What preprocessing was done, and how did the data come to be in the form that you are using?

The raw data is from the kaggle.com website. To transform into clean version, we did below steps:

1. Separate date to date and time and drop the time column as it does not provide useful info;

2. Correct data format into date, numeric, etc. based on what the data represents;

3. Omit rows containing NA (there are only a few);

4. Since there are originally 67 genres which will make classification difficult, so merged it into 7 genres by merging, for example, “conscious hip hop” to “hip hop”, and create factor for genre
- Pitchfork: For the pitchfork dataset, changes are made to the label and date. Repetitive labels are deleted and the column was separated into six with one tag in each cell. The “date” column is converted from string to date format. A separate column is created for the review year to compare it with the review year, and a new column that shows whether the review is retrospective is added. Unrelated variables - “link” and “review” - are removed. After tidying the dataset, pitchfork can be joined with fantano without conflicts in variable format.
If people are involved, were they aware of the data collection and if so, what purpose did they expect the data to be used for?

Fantano: Only artist names are involved; they may be expect their name to appear along with the songs for review, since publication of a song means it’s going to go public and face some judgement.

Pitchfork: To my knowledge the authors and artist are not aware that this data has been collected. Given that it is all displayed professionally for mass consumption on a website, however, and involves no personal data, I doubt they would mind using these reviews as analysis.

Data limitations

For fantano_cleaned, since it’s from a music youtuber / blogger named Anthony Fantano who has been reviewing music since the early 2010s, it may not be representative of public taste of US people: every rows of review is wrote by the same person Anthony Fantano. On the other hand, pitchfork is a very famous music publication, so it is more representative of public opinions. Also, pitchfork and fantano don’t review exactly the same albums, so It will be meaningless to directly combine them; but it would be useful to make a contrast between them on common albums they have.

Another limitation is that fantano_cleaned mainly focused on modern music (the earliest data is 2013-03-26), so does not provide a comprehensive view of music taste earlier than 2013. With that said, it is sufficient for our use since we are interested in investigating the music taste changes in the recent 10 years.

Additionally, during exploratory data analysis, we found it difficult to analyze non-numeric columns such as “review” in pitchfork and “album” and “artist” in both data frames. Because R Studio is not good at analyzing string patterns, our analysis may be based on categorical and numeric variables for most of the parts.

fantano_cleaned |>
  arrange(date)

# A tibble: 1,267 × 11
   album       artist date       genre score tag1  tag2  tag3  tag4  tag5  tag6 
   <chr>       <chr>  <date>     <fct> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
 1 The 20/20 … Justi… 2013-03-26 pop      10 "201… " da… " ju… " mi… " mu… " po…
 2 A Special … Open … 2015-02-06 hip …     8 "201… " a … " ep… " hi… " op… " ra…
 3 Xe          Zs     2015-02-06 rock      8 "201… " al… " dr… " ex… " fr… " ja…
 4 Lost Themes John … 2015-02-06 elec…     7 "alb… " ho… " jo… " lo… " pr… " RE…
 5 To Pimp A … Kendr… 2015-06-20 hip …    10 "ken… " to… " 20… " 10… " al… " to…
 6 The Epic    Kamas… 2015-06-20 other     9 "kam… " th… " al… " 20… " 9/… " po…
 7 The Powers… Death… 2015-06-20 hip …     9 "dea… " th… " ni… " je… " th… " al…
 8 Citizen Zo… The P… 2015-06-20 pop       9 "uk " " th… " ci… " 9/… " al… " 20…
 9 Painted Sh… Hop A… 2015-06-20 rock      8 "hop… " pa… " al… " 20… " sa… " ro…
10 Frozen Nia… Pruri… 2015-06-20 other     8 "pru… " fr… " al… " 20… " pr… " ot…
# ℹ 1,257 more rows

Exploratory data analysis

Perform an (initial) exploratory data analysis.

fantano_cleaned

# A tibble: 1,267 × 11
   album       artist date       genre score tag1  tag2  tag3  tag4  tag5  tag6 
   <chr>       <chr>  <date>     <fct> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Spare Ribs  Sleaf… 2021-03-19 hip …    10 "sle… " sp… " 20… " al… " uk… " ro…
 2 You Won't … Daugh… 2018-11-01 loud…    10 "dau… " yo… " 20… " ip… " lo… " no…
 3 KIDS SEE G… Kids … 2018-06-11 hip …    10 "kid… " se… " 20… " g.… " ka… " ki…
 4 To Pimp A … Kendr… 2015-06-20 hip …    10 "ken… " to… " 20… " 10… " al… " to…
 5 The 20/20 … Justi… 2013-03-26 pop      10 "201… " da… " ju… " mi… " mu… " po…
 6 Daddy's Ho… St. V… 2021-05-18 pop       9 "st.… " da… " 20… " al… " lo… " po…
 7 Promises    Float… 2021-04-12 other     9 "flo… " ph… " lo… " ls… " al… " 20…
 8 Plastic Be… Goril… 2021-01-12 pop       9 "gor… " pl… " al… " 20… " 20… " re…
 9 Windswept … Ichik… 2020-12-09 other     9 "ich… " wi… " ad… " al… " 20… " he…
10 Visions of… clipp… 2020-10-28 sub …     9 "cli… " vi… " 20… " al… " su… " ra…
# ℹ 1,257 more rows

fantano_cleaned |>
  group_by(genre) |>
  summarize(score = score,
    album_number = n()) |>
  ggplot(aes(x = score, y = genre)) +
  geom_boxplot() +
  labs(
    title = "Fantano Album Scores by Genre",
    x = "Score (1 - 10)",
    y = "Genre"
  ) +
  theme_minimal()

Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.

`summarise()` has grouped output by 'genre'. You can override using the
`.groups` argument.

year_score_sum <- pitchfork_cleaned |> 
  group_by(release_year) |>
  summarize(avgscore = mean(score))
year_score_sum |> 
  ggplot(aes(x = release_year, y = avgscore)) +
  geom_line() + 
  theme_minimal() + 
  geom_vline(xintercept = 1997) +
  labs(
    title = "Average album rating based on album release year",
    subtitle = "Vertical line when pitchfork starts releasing reviews",
    x = "Year of album release",
    y = "Average score among albums released that year"
  )

year_score_sum_m <- pitchfork_cleaned |> 
  group_by(release_year, is_retrospective) |>
  summarize(avgscore = mean(score))

`summarise()` has grouped output by 'release_year'. You can override using the
`.groups` argument.

year_score_sum_m

# A tibble: 80 × 3
# Groups:   release_year [60]
   release_year is_retrospective avgscore
          <dbl> <chr>               <dbl>
 1         1957 yes                  8.95
 2         1960 yes                  9   
 3         1962 yes                  8.2 
 4         1963 yes                  9.43
 5         1964 yes                  9.34
 6         1965 yes                  8.96
 7         1966 yes                  8.52
 8         1967 yes                  8.68
 9         1968 yes                  8.39
10         1969 yes                  8.78
# ℹ 70 more rows

year_score_sum_m |> 
  ggplot(aes(x = release_year, y = avgscore, color = is_retrospective, group = is_retrospective)) +
  geom_line() + 
  theme_minimal() + 
  xlim(1996, 2020) +
  labs(
    title = "Average album rating based on album release year",
    subtitle = "Comparing retrospective and modern reviews",
    x = "Year of album release",
    y = "Average score among albums released that year",
    color = "Is the review retrospective?"
  ) +
  theme(
    legend.position = "bottom", 
    legend.direction = "horizontal"
  )

Warning: Removed 36 rows containing missing values (`geom_line()`).

fantano_new <- fantano_cleaned |>
  mutate(year = year(date)) |>
  select(genre, score, year) |>
  group_by(genre, year) |>
  filter(genre == "hip hop") |>
  group_by(year) |>
  summarize(average_score = mean(score)) |>
  filter(average_score != "NA")

ggplot(fantano_new, mapping = aes(x = year, y = average_score)) +
  geom_point() +
  geom_line() +
  labs (
    title = "Average Score of Hip Hop Album per Year",
    x = "Year",
    y = "Average Score",
  ) +
  theme_minimal()

Questions for reviewers

List specific questions for your peer reviewers and project mentor to answer in giving you feedback on this phase.

Are there any other relationships that you would like to see us analyze?
Are there any improvements that we can make to our data sets or graphs?
Are there any columns in our datasets that you don’t think we’ll need?

pitchfork_2genre <- pitchfork_cleaned |>
  filter(genre %in% c("Folk/Country", "Rock")) |>
  select(genre, score, release_year) |>
  na.omit()

pitchfork_summary <- pitchfork_2genre |>
  group_by(genre, release_year) |>
  summarize(n = n(), mean_score = mean(score))

`summarise()` has grouped output by 'genre'. You can override using the
`.groups` argument.

pitchfork_summary |> 
  ggplot(aes(x = release_year, y = mean_score, color = genre)) +
  geom_line() + 
  theme_minimal() + 
  labs(
    title = "Average album rating based on album release year",
    subtitle = "Vertical line when pitchfork starts releasing reviews",
    x = "Year of album release",
    y = "Average score among albums released that year"
  ) +
  facet_wrap(~genre)