# import packages
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   1.0.0
✔ tibble  3.2.1     ✔ dplyr   1.1.2
✔ tidyr   1.2.1     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.2     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.1.1
✔ infer        1.0.4     ✔ workflows    1.1.2
✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
✔ parsnip      1.0.3     ✔ yardstick    1.1.0
✔ recipes      1.0.6     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(skimr)
# import data
tiktok <- read_csv(file.path("data", "tiktok.csv"))
Rows: 6746 Columns: 23
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): track_id, track_name, artist_id, artist_name, album_id, release_da...
dbl (14): duration, popularity, danceability, energy, key, loudness, mode, s...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# view data, check data types are valid
glimpse(tiktok)
Rows: 6,746
Columns: 23
$ track_id         <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ track_name       <chr> "Lay It Down Gmix - Main", "Bartender (feat. Akon)", …
$ artist_id        <chr> "1Xfmvd48oOhEWkscWyEbh9", "3aQeKQSyrW4qWr35idm0cy", "…
$ artist_name      <chr> "Lloyd", "T-Pain", "T-Pain", "Blxst", "Gryffin", "Bel…
$ album_id         <chr> "43C6GVlhXG4KfZuEbxty3r", "6CrSEKCF4TYrbSIitegb3h", "…
$ duration         <dbl> 302186, 238800, 238800, 161684, 218295, 122772, 12277…
$ release_date     <chr> "2011-01-01", "2007-06-05", "2007-06-05", "2020-12-04…
$ popularity       <dbl> 28, 75, 75, 76, 72, 89, 89, 50, 89, 70, 70, 98, 98, 4…
$ danceability     <dbl> 0.597, 0.832, 0.832, 0.571, 0.548, 0.855, 0.855, 0.77…
$ energy           <dbl> 0.800, 0.391, 0.391, 0.767, 0.839, 0.463, 0.463, 0.80…
$ key              <dbl> 1, 8, 8, 2, 6, 3, 3, 11, 4, 1, 1, 8, 8, 1, 3, 0, 11, …
$ loudness         <dbl> -5.423, -8.504, -8.504, -5.160, -2.371, -7.454, -7.45…
$ mode             <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,…
$ speechiness      <dbl> 0.3120, 0.0628, 0.0628, 0.2870, 0.0644, 0.0367, 0.036…
$ acousticness     <dbl> 0.04610, 0.05640, 0.05640, 0.33600, 0.13500, 0.21700,…
$ instrumentalness <dbl> 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.7…
$ liveness         <dbl> 0.1800, 0.2240, 0.2240, 0.0809, 0.1020, 0.3470, 0.347…
$ valence          <dbl> 0.565, 0.436, 0.436, 0.605, 0.314, 0.866, 0.866, 0.90…
$ tempo            <dbl> 155.932, 104.961, 104.961, 93.421, 98.932, 102.931, 1…
$ playlist_id      <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ playlist_name    <chr> "6kVuF2PYLuvl9T85XjNbaO", "1RGIjMFMgJxkZHMDXVYzOJ", "…
$ duration_mins    <dbl> 5.036433, 3.980000, 3.980000, 2.694733, 3.638250, 2.0…
$ genre            <chr> "TIKTOK DANCE", "TIKTOK DANCE", "TIKTOK DANCE", "TIKT…
# remove rows with missing (NA/NaN) values, if any
# Build the cleaned table in one pipeline: drop every row that contains a
# missing value, then discard the unneeded playlist_id, playlist_name and
# genre columns.
tiktok_clean <- tiktok %>%
  na.omit() %>%
  select(-playlist_id, -playlist_name, -genre)
# checked column names; none need renaming
# note: keys go from 0 to 11 (C, C#, D, D#, E, F,…)
tiktok_clean
# A tibble: 6,746 × 20
   track_id      track_name artist_id artist_name album_id duration release_date
   <chr>         <chr>      <chr>     <chr>       <chr>       <dbl> <chr>       
 1 6kVuF2PYLuvl… Lay It Do… 1Xfmvd48… Lloyd       43C6GVl…   302186 2011-01-01  
 2 1RGIjMFMgJxk… Bartender… 3aQeKQSy… T-Pain      6CrSEKC…   238800 2007-06-05  
 3 1RGIjMFMgJxk… Bartender… 3aQeKQSy… T-Pain      6CrSEKC…   238800 2007-06-05  
 4 1dIWPXMX4kRH… Chosen (f… 4qXC0i02… Blxst       7Awrgen…   161684 2020-12-04  
 5 4QVS8YCpK71R… Tie Me Do… 2ZRQcIgz… Gryffin     69t8rpg…   218295 2018-08-03  
 6 7BoobGhD4x5K… Build a B… 26cMerAx… Bella Poar… 5YKqfiQ…   122772 2021-05-14  
 7 7BoobGhD4x5K… Build a B… 26cMerAx… Bella Poar… 5YKqfiQ…   122772 2021-05-14  
 8 5OKHUpNLi4GE… Ever After 1mCY2mHc… Bonnie Bai… 4TrCexU…   231559 2018-05-11  
 9 3J8EOeKLTLXO… Calling M… 6jGMq4yG… Lil Tjay    3MEKpJ7…   205458 2021-04-02  
10 5caZgotE4D6e… Clap For … 03T8GHHc… YungManny   7nYMFoZ…   125579 2021-03-20  
# ℹ 6,736 more rows
# ℹ 13 more variables: popularity <dbl>, danceability <dbl>, energy <dbl>,
#   key <dbl>, loudness <dbl>, mode <dbl>, speechiness <dbl>,
#   acousticness <dbl>, instrumentalness <dbl>, liveness <dbl>, valence <dbl>,
#   tempo <dbl>, duration_mins <dbl>

