# import packages
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0 ✔ purrr 1.0.0
✔ tibble 3.2.1 ✔ dplyr 1.1.2
✔ tidyr 1.2.1 ✔ stringr 1.5.0
✔ readr 2.1.3 ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom 1.0.2 ✔ rsample 1.1.1
✔ dials 1.1.0 ✔ tune 1.1.1
✔ infer 1.0.4 ✔ workflows 1.1.2
✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
✔ parsnip 1.0.3 ✔ yardstick 1.1.0
✔ recipes 1.0.6
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(skimr)
# Import data: read the TikTok tracks CSV into a tibble.
# file.path() builds the relative path to "tiktok.csv" inside the "data" folder.
tiktok <- read_csv(file.path("data", "tiktok.csv"))
Rows: 6746 Columns: 23
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): track_id, track_name, artist_id, artist_name, album_id, release_da...
dbl (14): duration, popularity, danceability, energy, key, loudness, mode, s...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the imported data: print each column with its type and first few
# values so the parsed data types can be checked.
tiktok |>
  glimpse()
Rows: 6,746
Columns: 23
$ track_id <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ track_name <chr> "Lay It Down Gmix - Main", "Bartender (feat. Akon)", …
$ artist_id <chr> "1Xfmvd48oOhEWkscWyEbh9", "3aQeKQSyrW4qWr35idm0cy", "…
$ artist_name <chr> "Lloyd", "T-Pain", "T-Pain", "Blxst", "Gryffin", "Bel…
$ album_id <chr> "43C6GVlhXG4KfZuEbxty3r", "6CrSEKCF4TYrbSIitegb3h", "…
$ duration <dbl> 302186, 238800, 238800, 161684, 218295, 122772, 12277…
$ release_date <chr> "2011-01-01", "2007-06-05", "2007-06-05", "2020-12-04…
$ popularity <dbl> 28, 75, 75, 76, 72, 89, 89, 50, 89, 70, 70, 98, 98, 4…
$ danceability <dbl> 0.597, 0.832, 0.832, 0.571, 0.548, 0.855, 0.855, 0.77…
$ energy <dbl> 0.800, 0.391, 0.391, 0.767, 0.839, 0.463, 0.463, 0.80…
$ key <dbl> 1, 8, 8, 2, 6, 3, 3, 11, 4, 1, 1, 8, 8, 1, 3, 0, 11, …
$ loudness <dbl> -5.423, -8.504, -8.504, -5.160, -2.371, -7.454, -7.45…
$ mode <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,…
$ speechiness <dbl> 0.3120, 0.0628, 0.0628, 0.2870, 0.0644, 0.0367, 0.036…
$ acousticness <dbl> 0.04610, 0.05640, 0.05640, 0.33600, 0.13500, 0.21700,…
$ instrumentalness <dbl> 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.7…
$ liveness <dbl> 0.1800, 0.2240, 0.2240, 0.0809, 0.1020, 0.3470, 0.347…
$ valence <dbl> 0.565, 0.436, 0.436, 0.605, 0.314, 0.866, 0.866, 0.90…
$ tempo <dbl> 155.932, 104.961, 104.961, 93.421, 98.932, 102.931, 1…
$ playlist_id <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ playlist_name <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ duration_mins <dbl> 5.036433, 3.980000, 3.980000, 2.694733, 3.638250, 2.0…
$ genre <chr> "TIKTOK DANCE", "TIKTOK DANCE", "TIKTOK DANCE", "TIKT…
# Remove every row that contains a missing value (NA or NaN) in any column.
tiktok_clean <- na.omit(tiktok)
# Keep only the columns needed here and print the result so the column names
# can be checked (e.g. for renaming).
tiktok_clean |>
  select(duration_mins, popularity, danceability, release_date) |>
  mutate(date = as.Date(release_date)) |> # convert release_date to type Date
  drop_na(date) |> # drop rows where the converted date is NA
  # Use the `date` column created earlier in this pipeline. Writing
  # `tiktok_clean$date` would look in the input tibble, which has no `date`
  # column, and would leave `year` filled with NA.
  mutate(year = as.numeric(format(date, "%Y")))
Warning: There were 2 warnings in `mutate()`.
The first warning was:
ℹ In argument: `year = as.numeric(format(tiktok_clean$date, "%Y"))`.
Caused by warning:
! Unknown or uninitialised column: `date`.
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# A tibble: 6,644 × 6
duration_mins popularity danceability release_date date year
<dbl> <dbl> <dbl> <chr> <date> <dbl>
1 5.04 28 0.597 2011-01-01 2011-01-01 NA
2 3.98 75 0.832 2007-06-05 2007-06-05 NA
3 3.98 75 0.832 2007-06-05 2007-06-05 NA
4 2.69 76 0.571 2020-12-04 2020-12-04 NA
5 3.64 72 0.548 2018-08-03 2018-08-03 NA
6 2.05 89 0.855 2021-05-14 2021-05-14 NA
7 2.05 89 0.855 2021-05-14 2021-05-14 NA
8 3.86 50 0.774 2018-05-11 2018-05-11 NA
9 3.42 89 0.907 2021-04-02 2021-04-02 NA
10 2.09 70 0.903 2021-03-20 2021-03-20 NA
# ℹ 6,634 more rows