# import packages
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0 ✔ purrr 1.0.0
✔ tibble 3.2.1 ✔ dplyr 1.1.2
✔ tidyr 1.2.1 ✔ stringr 1.5.0
✔ readr 2.1.3 ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom 1.0.2 ✔ rsample 1.1.1
✔ dials 1.1.0 ✔ tune 1.1.1
✔ infer 1.0.4 ✔ workflows 1.1.2
✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
✔ parsnip 1.0.3 ✔ yardstick 1.1.0
✔ recipes 1.0.6
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(skimr)
# import data: read data/tiktok.csv into a tibble named `tiktok`
# (file.path() builds the path with the platform's separator)
tiktok <- read_csv(file.path("data", "tiktok.csv"))
Rows: 6746 Columns: 23
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): track_id, track_name, artist_id, artist_name, album_id, release_da...
dbl (14): duration, popularity, danceability, energy, key, loudness, mode, s...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# inspect the imported data: one line per column with its type and the
# first few values, to confirm each column was parsed as expected
tiktok %>% glimpse()
Rows: 6,746
Columns: 23
$ track_id <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ track_name <chr> "Lay It Down Gmix - Main", "Bartender (feat. Akon)", …
$ artist_id <chr> "1Xfmvd48oOhEWkscWyEbh9", "3aQeKQSyrW4qWr35idm0cy", "…
$ artist_name <chr> "Lloyd", "T-Pain", "T-Pain", "Blxst", "Gryffin", "Bel…
$ album_id <chr> "43C6GVlhXG4KfZuEbxty3r", "6CrSEKCF4TYrbSIitegb3h", "…
$ duration <dbl> 302186, 238800, 238800, 161684, 218295, 122772, 12277…
$ release_date <chr> "2011-01-01", "2007-06-05", "2007-06-05", "2020-12-04…
$ popularity <dbl> 28, 75, 75, 76, 72, 89, 89, 50, 89, 70, 70, 98, 98, 4…
$ danceability <dbl> 0.597, 0.832, 0.832, 0.571, 0.548, 0.855, 0.855, 0.77…
$ energy <dbl> 0.800, 0.391, 0.391, 0.767, 0.839, 0.463, 0.463, 0.80…
$ key <dbl> 1, 8, 8, 2, 6, 3, 3, 11, 4, 1, 1, 8, 8, 1, 3, 0, 11, …
$ loudness <dbl> -5.423, -8.504, -8.504, -5.160, -2.371, -7.454, -7.45…
$ mode <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,…
$ speechiness <dbl> 0.3120, 0.0628, 0.0628, 0.2870, 0.0644, 0.0367, 0.036…
$ acousticness <dbl> 0.04610, 0.05640, 0.05640, 0.33600, 0.13500, 0.21700,…
$ instrumentalness <dbl> 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.7…
$ liveness <dbl> 0.1800, 0.2240, 0.2240, 0.0809, 0.1020, 0.3470, 0.347…
$ valence <dbl> 0.565, 0.436, 0.436, 0.605, 0.314, 0.866, 0.866, 0.90…
$ tempo <dbl> 155.932, 104.961, 104.961, 93.421, 98.932, 102.931, 1…
$ playlist_id <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ playlist_name <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ duration_mins <dbl> 5.036433, 3.980000, 3.980000, 2.694733, 3.638250, 2.0…
$ genre <chr> "TIKTOK DANCE", "TIKTOK DANCE", "TIKTOK DANCE", "TIKT…
# remove rows containing missing values, if any
# (na.omit() drops every row with at least one NA; NaN counts as NA)
tiktok_clean <- na.omit(tiktok)
# drop unnecessary columns: playlist_id, playlist_name, genre
tiktok_clean <- select(tiktok_clean, -playlist_id, -playlist_name, -genre)
# checked column names, none necessary for renaming
# note: keys go from 0 to 11 (C, C#, D, D#, E, F,…)
# print the cleaned tibble to confirm the remaining rows and columns
tiktok_clean
# A tibble: 6,746 × 20
track_id track_name artist_id artist_name album_id duration release_date
<chr> <chr> <chr> <chr> <chr> <dbl> <chr>
1 6kVuF2PYLuvl… Lay It Do… 1Xfmvd48… Lloyd 43C6GVl… 302186 2011-01-01
2 1RGIjMFMgJxk… Bartender… 3aQeKQSy… T-Pain 6CrSEKC… 238800 2007-06-05
3 1RGIjMFMgJxk… Bartender… 3aQeKQSy… T-Pain 6CrSEKC… 238800 2007-06-05
4 1dIWPXMX4kRH… Chosen (f… 4qXC0i02… Blxst 7Awrgen… 161684 2020-12-04
5 4QVS8YCpK71R… Tie Me Do… 2ZRQcIgz… Gryffin 69t8rpg… 218295 2018-08-03
6 7BoobGhD4x5K… Build a B… 26cMerAx… Bella Poar… 5YKqfiQ… 122772 2021-05-14
7 7BoobGhD4x5K… Build a B… 26cMerAx… Bella Poar… 5YKqfiQ… 122772 2021-05-14
8 5OKHUpNLi4GE… Ever After 1mCY2mHc… Bonnie Bai… 4TrCexU… 231559 2018-05-11
9 3J8EOeKLTLXO… Calling M… 6jGMq4yG… Lil Tjay 3MEKpJ7… 205458 2021-04-02
10 5caZgotE4D6e… Clap For … 03T8GHHc… YungManny 7nYMFoZ… 125579 2021-03-20
# ℹ 6,736 more rows
# ℹ 13 more variables: popularity <dbl>, danceability <dbl>, energy <dbl>,
# key <dbl>, loudness <dbl>, mode <dbl>, speechiness <dbl>,
# acousticness <dbl>, instrumentalness <dbl>, liveness <dbl>, valence <dbl>,
# tempo <dbl>, duration_mins <dbl>