library(tidyverse)
library(skimr)
Data Analysis of Severe Crimes Committed in New York City
Appendix to report
Data cleaning
Rows: 189774 Columns: 19
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): ARREST_DATE, PD_DESC, OFNS_DESC, LAW_CODE, LAW_CAT_CD, ARREST_BORO...
dbl (9): ARREST_KEY, PD_CD, KY_CD, ARREST_PRECINCT, JURISDICTION_CODE, X_CO...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(drop_cols)
# Now:
data %>% select(all_of(drop_cols))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# A tibble: 186,784 × 17
arrest_month arrest_day arrest_year offense_description level_of_offense
<int> <int> <int> <chr> <chr>
1 1 23 2022 JOSTLING M
2 1 31 2022 ROBBERY F
3 2 1 2022 FELONY ASSAULT F
4 2 13 2022 ASSAULT 3 & RELATED OFF… M
5 2 21 2022 ROBBERY F
6 3 14 2022 FELONY ASSAULT F
7 3 22 2022 FELONY ASSAULT F
8 3 29 2022 OFFENSES INVOLVING FRAUD M
9 4 5 2022 RAPE F
10 5 4 2022 FELONY ASSAULT F
# ℹ 186,774 more rows
# ℹ 12 more variables: arrest_boro <chr>, arrest_precinct <dbl>,
# jurisdiction_code <dbl>, age_group <fct>, gender <chr>, race <chr>,
# x_coord_cd <dbl>, y_coord_cd <dbl>, latitude <dbl>, longitude <dbl>,
# severe_crime <fct>, arrest_borough <chr>
# A tibble: 186,784 × 17
arrest_month arrest_day arrest_year offense_description level_of_offense
<int> <int> <int> <chr> <chr>
1 1 23 2022 JOSTLING M
2 1 31 2022 ROBBERY F
3 2 1 2022 FELONY ASSAULT F
4 2 13 2022 ASSAULT 3 & RELATED OFF… M
5 2 21 2022 ROBBERY F
6 3 14 2022 FELONY ASSAULT F
7 3 22 2022 FELONY ASSAULT F
8 3 29 2022 OFFENSES INVOLVING FRAUD M
9 4 5 2022 RAPE F
10 5 4 2022 FELONY ASSAULT F
# ℹ 186,774 more rows
# ℹ 12 more variables: arrest_boro <chr>, arrest_precinct <dbl>,
# jurisdiction_code <dbl>, age_group <fct>, gender <chr>, race <chr>,
# x_coord_cd <dbl>, y_coord_cd <dbl>, latitude <dbl>, longitude <dbl>,
# severe_crime <fct>, arrest_borough <chr>
EDA
|>
arrest_clean summarize(
avg_arrest_month = mean(arrest_month),
sd_arrest_month = sd(arrest_month),
med_arrest_month = median(arrest_month)
)
# A tibble: 1 × 3
avg_arrest_month sd_arrest_month med_arrest_month
<dbl> <dbl> <dbl>
1 6.62 3.39 7
|>
arrest_clean ggplot(
mapping = aes(x = arrest_month)
+
) geom_bar()
|>
arrest_clean summarize(
avg_arrest_day = mean(arrest_day),
sd_arrest_day = sd(arrest_day),
med_arrest_day = median(arrest_day)
)
# A tibble: 1 × 3
avg_arrest_day sd_arrest_day med_arrest_day
<dbl> <dbl> <dbl>
1 15.5 8.75 15
|>
arrest_clean ggplot(
mapping = aes(x = arrest_day)
+
) geom_bar()
<- arrest_clean |>
offense_descr_count group_by(offense_description) |>
count(offense_description)
offense_descr_count
# A tibble: 64 × 2
# Groups: offense_description [64]
offense_description n
<chr> <int>
1 ADMINISTRATIVE CODE 126
2 ADMINISTRATIVE CODES 1
3 AGRICULTURE & MRKTS LAW-UNCLASSIFIED 73
4 ALCOHOLIC BEVERAGE CONTROL LAW 146
5 ANTICIPATORY OFFENSES 20
6 ARSON 144
7 ASSAULT 3 & RELATED OFFENSES 30582
8 BURGLAR'S TOOLS 568
9 BURGLARY 6231
10 CANNABIS RELATED OFFENSES 125
# ℹ 54 more rows
<- arrest_clean |>
offense_descr_level_count group_by(offense_description, level_of_offense) |>
count(offense_description)
offense_descr_level_count
# A tibble: 86 × 3
# Groups: offense_description, level_of_offense [86]
offense_description level_of_offense n
<chr> <chr> <int>
1 ADMINISTRATIVE CODE I 7
2 ADMINISTRATIVE CODE M 34
3 ADMINISTRATIVE CODE V 85
4 ADMINISTRATIVE CODES V 1
5 AGRICULTURE & MRKTS LAW-UNCLASSIFIED M 73
6 ALCOHOLIC BEVERAGE CONTROL LAW M 146
7 ANTICIPATORY OFFENSES M 20
8 ARSON F 144
9 ASSAULT 3 & RELATED OFFENSES M 30582
10 BURGLAR'S TOOLS M 568
# ℹ 76 more rows
|>
arrest_clean summarize(
avg_longitude = mean(longitude),
med_longitude = median(longitude),
sd_longitude = sd(longitude),
avg_latitude = mean(latitude),
med_latitude = median(latitude),
sd_latitude = sd(latitude),
)
# A tibble: 1 × 6
avg_longitude med_longitude sd_longitude avg_latitude med_latitude sd_latitude
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -73.9 -73.9 0.0762 40.7 40.7 0.0814
|>
arrest_clean ggplot(aes(x = longitude, y = latitude)) +
geom_point(size = .25, show.legend = FALSE) +
coord_quickmap()
|>
arrest_clean group_by(offense_description, level_of_offense, age_group) |>
count(offense_description)
# A tibble: 344 × 4
# Groups: offense_description, level_of_offense, age_group [344]
offense_description level_of_offense age_group n
<chr> <chr> <fct> <int>
1 ADMINISTRATIVE CODE I 18-24 1
2 ADMINISTRATIVE CODE I 25-44 3
3 ADMINISTRATIVE CODE I 45-64 2
4 ADMINISTRATIVE CODE I 65+ 1
5 ADMINISTRATIVE CODE M 18-24 1
6 ADMINISTRATIVE CODE M 25-44 18
7 ADMINISTRATIVE CODE M 45-64 13
8 ADMINISTRATIVE CODE M 65+ 2
9 ADMINISTRATIVE CODE V 18-24 10
10 ADMINISTRATIVE CODE V 25-44 50
# ℹ 334 more rows
#Number of offenses committed by each age group
<- arrest_clean |>
age_counts group_by(age_group) |>
summarise(num_offenses = n())
age_counts
# A tibble: 5 × 2
age_group num_offenses
<fct> <int>
1 <18 6770
2 18-24 32705
3 25-44 107415
4 45-64 37056
5 65+ 2838
# Bar graph of age group vs. number of offenses
ggplot(age_counts, aes(x = age_group, y = num_offenses)) +
geom_bar(stat = "identity") +
labs(
x = "Age group",
y = "Number of offenses",
title = "Number of offenses by Age Group"
)
#Number of offenses committed by each age group-race
<- arrest_clean |>
age_counts group_by(age_group, race) |>
summarise(num_offenses = n())
`summarise()` has grouped output by 'age_group'. You can override using the
`.groups` argument.
age_counts
# A tibble: 30 × 3
# Groups: age_group [5]
age_group race num_offenses
<fct> <chr> <int>
1 <18 AMERICAN INDIAN/ALASKAN NATIVE 15
2 <18 ASIAN / PACIFIC ISLANDER 266
3 <18 BLACK 4091
4 <18 BLACK HISPANIC 735
5 <18 WHITE 305
6 <18 WHITE HISPANIC 1358
7 18-24 AMERICAN INDIAN/ALASKAN NATIVE 102
8 18-24 ASIAN / PACIFIC ISLANDER 1560
9 18-24 BLACK 16992
10 18-24 BLACK HISPANIC 3592
# ℹ 20 more rows
# Bar graph of age group vs. number of offenses by race
ggplot(age_counts, aes(x = age_group, y = num_offenses)) +
geom_bar(stat = "identity") +
labs(
x = "Age group",
y = "Number of offenses",
title = "Number of offenses by age group and race"
+
) facet_wrap(vars(race))
<- arrest_clean |>
race_count group_by(race) |>
count(race) |>
mutate(percentage = 100*n/189774)
race_count
# A tibble: 6 × 3
# Groups: race [6]
race n percentage
<chr> <int> <dbl>
1 AMERICAN INDIAN/ALASKAN NATIVE 516 0.272
2 ASIAN / PACIFIC ISLANDER 10085 5.31
3 BLACK 93229 49.1
4 BLACK HISPANIC 17139 9.03
5 WHITE 19158 10.1
6 WHITE HISPANIC 46657 24.6
<- arrest_clean |>
borough_counts group_by(arrest_boro) |>
count()
borough_counts
# A tibble: 5 × 2
# Groups: arrest_boro [5]
arrest_boro n
<chr> <int>
1 B 43519
2 K 50531
3 M 45945
4 Q 38511
5 S 8278
# plot a bar graph of borough vs. number of offenses
ggplot(borough_counts, aes(x = arrest_boro, y = n)) +
geom_bar(stat = "identity") +
labs(
x = "Borough",
y = "Number of offences",
title = "Offenses by Borough"
)
Spatial Data Analysis
= seq(40.49939, 40.91296, by = 0.41357/28.530)
latitudes = seq(-74.25422, -73.70072, by = 0.5535/28.530)
longitudes <- arrest_clean |>
arrest_grouped mutate(
grid_cell = cut(
$latitude,
arrest_cleanbreaks = latitudes, labels = FALSE
+
) cut(
($longitude,
arrest_cleanbreaks = longitudes,
labels = FALSE) - 1) * length(latitudes),
grid_cell = factor(grid_cell)
|>
) group_by(grid_cell) |>
filter(severe_crime == "Yes")|>
summarize(
avg_lat = mean(latitude),
avg_long = mean(longitude),
min_lat = min(latitude),
max_lat = max(latitude),
min_long = min(longitude),
max_long = max(longitude),
num_severe_crimes = n(),
common_boro = names(sort(-table(arrest_borough)))[1]
)
|>
arrest_grouped head(5)
# A tibble: 5 × 9
grid_cell avg_lat avg_long min_lat max_lat min_long max_long num_severe_crimes
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1 40.5 -74.2 40.5 40.5 -74.3 -74.2 119
2 2 40.5 -74.2 40.5 40.5 -74.2 -74.2 13
3 3 40.5 -74.2 40.5 40.5 -74.2 -74.2 3
4 30 40.5 -74.2 40.5 40.5 -74.2 -74.2 6
5 31 40.5 -74.2 40.5 40.5 -74.2 -74.2 12
# ℹ 1 more variable: common_boro <chr>
Summary Statistics by Grid
|>
arrest_grouped ggplot(
aes(
x = avg_long,
y = avg_lat,
size = num_severe_crimes,
color = common_boro)) +
geom_point(alpha = 0.5) +
scale_size(guide = "none") +
coord_quickmap() +
labs(
x = "Longitude(°)",
y = "Latitude(°)",
title = "Severe Crimes in New York City",
color = "Borough"
+
) theme_minimal() +
theme(
plot.title = element_text(size = 20)
)