FBI Crime Data Analysis

Author

Trusting Koala
Yuqing Sun(ys2434),Maria Chang (mc2995), Tianyi Wang (tw324)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: 'scales'


The following object is masked from 'package:purrr':

    discard


The following object is masked from 'package:readr':

    col_factor



Attaching package: 'maps'


The following object is masked from 'package:purrr':

    map

# Data cleaning and structuring
agencies_cleaned <- agencies %>%
  mutate(
    # Standardize agency type
    agency_type = str_to_title(str_trim(replace_na(agency_type, "Unknown"))),
    # Convert dates
    nibrs_start_date = as.Date(nibrs_start_date),
    nibrs_start_year = year(nibrs_start_date),
    # Boolean indicator
    is_nibrs = case_when(
      is_nibrs %in%
        c(TRUE, "TRUE", "True", "T", 1, "1", "Yes", "YES", "Y") ~ TRUE,
      is_nibrs %in%
        c(FALSE, "FALSE", "False", "F", 0, "0", "No", "NO", "N") ~ FALSE,
      TRUE ~ NA
    )
  )
ori_dupes <- agencies_cleaned %>%
  count(ori, sort = TRUE) %>%
  filter(n > 1)
ori_dupes

[1] ori n  
<0 rows> (or 0-length row.names)

# Define political leaning (2020 Election)
dem_states <- c(
  'AZ',
  'CA',
  'CO',
  'CT',
  'DE',
  'DC',
  'GA',
  'HI',
  'IL',
  'ME',
  'MD',
  'MA',
  'MI',
  'MN',
  'NV',
  'NH',
  'NJ',
  'NM',
  'NY',
  'OR',
  'PA',
  'RI',
  'VT',
  'VA',
  'WA',
  'WI'
)
rep_states <- c(
  'AL',
  'AK',
  'AR',
  'FL',
  'ID',
  'IN',
  'IA',
  'KS',
  'KY',
  'LA',
  'MS',
  'MO',
  'MT',
  'NE',
  'NC',
  'ND',
  'OH',
  'OK',
  'SC',
  'SD',
  'TN',
  'TX',
  'UT',
  'WV',
  'WY'
)

agencies_cleaned <- agencies_cleaned %>%
  mutate(
    state_political_leaning = case_when(
      state_abbr %in% dem_states ~ "Democratic",
      state_abbr %in% rep_states ~ "Republican",
      TRUE ~ "Other"
    )
  )

setdiff(unique(na.omit(agencies_cleaned$state_abbr)), c(dem_states, rep_states))

character(0)

# Q1 metrics

agencies_q1 <- agencies_cleaned %>%
  filter(!is.na(state_abbr)) # avoid NA-state groups

# State baseline
state_baseline <- agencies_cleaned %>%
  group_by(state_abbr) %>%
  summarise(
    state_overall_participation_rate = mean(is_nibrs, na.rm = TRUE),
    .groups = "drop"
  )

# Agency type metrics per state
q1_metrics <- agencies_q1 %>%
  group_by(state_abbr, state, state_political_leaning, agency_type) %>%
  summarise(
    n_agencies = n(),
    n_nibrs = sum(is_nibrs, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(pct_nibrs = n_nibrs / n_agencies) %>%
  left_join(state_baseline, by = "state_abbr") %>%
  mutate(participation_gap = pct_nibrs - state_overall_participation_rate) %>%
  # Flag or filter small groups (threshold < 5 as per proposal)
  mutate(is_reliable = n_agencies >= 5)

state_political_summary <- agencies_q1 %>%
  group_by(state_political_leaning) %>%
  summarise(
    participation_rate = mean(is_nibrs, na.rm = TRUE),
    n_agencies = n(),
    n_states = n_distinct(state_abbr),
    .groups = "drop"
  )

state_political_summary

# A tibble: 2 × 4
  state_political_leaning participation_rate n_agencies n_states
  <chr>                                <dbl>      <int>    <int>
1 Democratic                           0.697       9339       25
2 Republican                           0.871       9827       25

# Q2 metrics
# National median for adoption delay calculation
national_median <- agencies_cleaned %>%
  filter(is_nibrs == TRUE) %>%
  pull(nibrs_start_year) %>%
  median(na.rm = TRUE)

# Trends (Yearly & Cumulative)
q2_trends <- agencies_cleaned %>%
  filter(is_nibrs == TRUE, !is.na(nibrs_start_year)) %>%
  group_by(agency_type, nibrs_start_year) %>%
  summarise(n_new_adopters_year = n(), .groups = "drop") %>%
  arrange(agency_type, nibrs_start_year) %>%
  group_by(agency_type) %>%
  mutate(cumulative_adopters_year = cumsum(n_new_adopters_year)) %>%
  left_join(
    agencies_cleaned %>%
      group_by(agency_type) %>%
      summarise(total_agencies_type = n()),
    by = "agency_type"
  ) %>%
  mutate(
    cumulative_adoption_rate = cumulative_adopters_year / total_agencies_type
  )

# Adoption Timing Summary
q2_summary <- agencies_cleaned %>%
  filter(is_nibrs == TRUE, !is.na(nibrs_start_year)) %>%
  group_by(agency_type) %>%
  summarise(
    median_adoption_year = median(nibrs_start_year),
    mean_adoption_year = mean(nibrs_start_year),
    interquartile_range = IQR(nibrs_start_year),
    .groups = "drop"
  ) %>%
  # Add "Time to 50% Cumulative Adoption"
  left_join(
    q2_trends %>%
      filter(cumulative_adoption_rate >= 0.5) %>%
      group_by(agency_type) %>%
      summarise(time_to_50_pct_adoption = min(nibrs_start_year)),
    by = "agency_type"
  ) %>%
  mutate(adoption_delay = median_adoption_year - national_median)
q2_summary

# A tibble: 8 × 6
  agency_type        median_adoption_year mean_adoption_year interquartile_range
  <chr>                             <dbl>              <dbl>               <dbl>
1 City                               2016              2011.                  19
2 County                             2013              2010.                  20
3 Other                              2017              2013.                  16
4 Other State Agency                 2004              2008.                  12
5 State Police                       1998              2006.                  27
6 Tribal                             2021              2020.                   0
7 University Or Col…                 2019              2013.                  16
8 Unknown                            2021              2021.                   3
# ℹ 2 more variables: time_to_50_pct_adoption <dbl>, adoption_delay <dbl>

Introduction

Law enforcement agencies across the United States vary widely in size, jurisdiction, and reporting practices. Understanding how these agencies are distributed geographically and how broadly they participate in standardized crime reporting systems is important to interpret national crime data. In this report we wanted to explore a dataset of U.S. law enforcement agencies, examining patterns in agency type, location, and participation in the FBI’s National Incident-Based Reporting System (NIBRS) through a series of visualizations.

Question 1 <- Are certain types of law enforcement agencies systematically less likely to participate in NIBRS reporting, even within the same state?

Introduction

While the FBI’s National Incident-Based Reporting System (NIBRS) offers a standardized framework for tracking incidents, participation is far from universal. The question is whether certain types of agencies consistently fall behind, and whether that gap holds up even when comparing agencies operating under the same state policies.

If participation gaps persist within states, it points to something deeper than policy: structural differences in how certain kinds of agencies engage with standardized reporting. To investigate this, we draw on is_nibrs, agency_type, and state_abbr. With this we aim to see what agency types are systematically underrepresented and how complete and trustworthy US National crime data is.

Approach

To investigate whether certain agency types are systematically underrepresented in NIBRS, we built the analysis in three steps.

We started by calculating each agency type’s participation rate relative to its state’s overall rate, using is_nibrs and agency_type, and summarized these within-state gaps in an aggregated dot plot. Point size encodes the share of states in which a given agency type falls below the state baseline, capturing both the magnitude and consistency of the gap across the country.
We then brought in a choropleth map of overall participation rates by state using state_abbr, to contextualize how much of the variation might still be driven by geography and state-level policy.
Finally, we produced a faceted dot plot focusing on the eight most agency-dense states, where larger agency counts yield more reliable within-state estimates, to test whether the aggregate patterns hold locally or are smoothed-out artifacts of the national summary.

Analysis

# graph 1: how systematic is under-participation by agency type?
plot_systematic <- q1_metrics |>
  filter(is_reliable == TRUE) |>
  group_by(agency_type) |>
  summarise(
    avg_gap = mean(participation_gap, na.rm = TRUE),
    med_gap = median(participation_gap, na.rm = TRUE),
    pct_states_below = mean(participation_gap < 0, na.rm = TRUE),
    n_states = n_distinct(state_abbr),
    .groups = "drop"
  ) |>
  # drop Unknown if you want a cleaner story
  filter(agency_type != "Unknown") |>
  arrange(avg_gap) |>
  mutate(agency_type = fct_inorder(agency_type))

ggplot(plot_systematic, aes(x = avg_gap, y = agency_type)) +
  geom_vline(
    xintercept = 0,
    linetype = "dashed",
    color = "gray50",
    linewidth = 0.4
  ) +
  geom_point(aes(size = pct_states_below), alpha = 0.85) +
  geom_segment(
    aes(x = 0, xend = avg_gap, yend = agency_type),
    color = "gray70"
  ) +
  scale_x_continuous(labels = label_percent(accuracy = 1)) +
  scale_size_continuous(
    labels = label_percent(accuracy = 1),
    range = c(2, 8),
    name = "% of states below baseline"
  ) +
  labs(
    title = "Which agency types systematically lag NIBRS participation?",
    subtitle = "Average within-state participation gap across states (reliable groups only)",
    x = "Average participation gap vs state baseline",
    y = "Agency Type"
  ) +
  theme_minimal()

# graph 2: choropleth
state_map_data <- state_baseline |>
  mutate(region = tolower(state.name[match(state_abbr, state.abb)]))

us_map <- map_data("state")

map_plot_data <- left_join(us_map, state_map_data, by = "region")

ggplot(
  map_plot_data,
  aes(long, lat, group = group, fill = state_overall_participation_rate)
) +
  geom_polygon(color = "white", linewidth = 0.2) +
  coord_fixed(1.3) +
  scale_fill_viridis_c(
    labels = label_percent(),
    name = "Participation Rate"
  ) +
  labs(
    title = "State-level overall NIBRS participation rates",
    x = NULL,
    y = NULL
  ) +
  theme_void()

# graph 3 (better): participation gap vs state baseline
top_states <- agencies_cleaned |>
  count(state_abbr, sort = TRUE) |>
  slice_head(n = 8) |>
  pull(state_abbr)

plot_gap <- q1_metrics |>
  filter(state_abbr %in% top_states, is_reliable == TRUE) |>
  mutate(
    agency_type = fct_reorder(
      agency_type,
      participation_gap,
      .fun = median,
      .desc = FALSE
    )
  )

ggplot(plot_gap, aes(x = participation_gap, y = agency_type)) +
  geom_vline(
    xintercept = 0,
    linetype = "dashed",
    linewidth = 0.4,
    color = "gray50"
  ) +
  geom_point(
    aes(size = n_agencies, color = participation_gap > 0),
    alpha = 0.85
  ) +
  facet_wrap(~state_abbr, ncol = 2) +
  scale_x_continuous(labels = label_percent(accuracy = 1)) +
  scale_size_continuous(range = c(1.5, 6), breaks = c(5, 25, 100)) +
  scale_color_manual(
    values = c(`TRUE` = "#1f77b4", `FALSE` = "#d62728"),
    guide = "none"
  ) +
  labs(
    title = "NIBRS participation gap by agency type (within-state)",
    subtitle = "Each point shows agency-type participation minus the state's overall participation rate (top 8 states by agencies)",
    x = "Participation gap vs state baseline",
    y = "Agency Type",
    size = "Agencies in group"
  ) +
  theme_minimal()

Discussion

The aggregated dot plot reveals a consistent pattern: “Other”, “Other State Agency”, and State Police agencies participate at lower rates than the state baseline, while Tribal and County agencies tend to exceed it. Notably, the large dot sizes for the lagging categories indicate these gaps are not small-sample cases but widespread tendencies that hold across many states. One possible explanation is that catch-all agency types like “Other” lack the institutional infrastructure or federal pressure that more formally defined agencies face, making standardized reporting adoption slower and more uneven.

The choropleth map adds an important layer to this picture. Participation is largely a state-level story: most western states report near-universal adoption while Pennsylvania and Florida stand out as notable laggards. This confirms that state mandates and infrastructure play a significant role in driving overall participation rates, which is precisely why controlling for state was essential in the first place. Without that control, agency-type gaps could easily be mistaken for geographic ones.

The faceted chart then tests whether these patterns hold at the state level. Looking at the eight most agency-dense states, the negative gaps for State Police and “Other” categories visible in the aggregate do reappear, particularly in FL, NJ, and NY, giving credibility to the idea that these are structural rather than incidental patterns. At the same time, states like CA and IL show a messier picture where gaps are smaller and less consistent, suggesting that high agency density alone does not guarantee uniform adoption. The overall picture is that agency type shapes participation, but the strength of that relationship varies with the local policy and institutional environment each agency operates in.

Question 2 <- When did agencies adopt NIBRS reporting, and which agency types adopted later than others?

Introduction

The transition to NIBRS represents a fundamental shift in how the United States tracks crime, moving from aggregate counts to detailed, incident-level data. But this shift did not happen overnight, and it did not happen evenly. Some agencies signed on early, building a culture of standardized reporting long before it was required. Others waited, dragged their feet, or needed a federal push to move at all. This question is about mapping that journey: who were the pioneers, who were the procrastinators, and what does the timing of adoption tell us about the forces driving it?

To do this, we lean on nibrs_start_date to trace when agencies joined the system and agency_type to see whether certain kinds of agencies consistently moved faster or slower than others. We are particularly interested in whether the 2021 FBI federal mandate acted as a universal catalyst or simply formalized a divide that had been building for decades. For researchers relying on NIBRS data, these lags matter: an agency that only joined in 2021 left years of historical blind spots in the record.

Approach

To tell this story, we use two visualizations that each illuminate a different dimension of the adoption timeline.

The first is a cumulative line plot by agency type, tracking the share of agencies within each type that had joined NIBRS by any given year. Using cumulative rates rather than raw counts normalizes across agency types of different sizes, making adoption speed directly comparable. The slope of each line carries the story: a steep early rise signals a type that moved fast, while a line that only jumps around 2021 points to one that needed the federal mandate to act.
The second is a faceted area chart organized by state, shifting the lens from who lagged to where. Each state panel shows how its agencies accumulated NIBRS participation over time, with states ordered by median adoption year so that early adopters naturally rise to the top and late adopters fall to the bottom. The area format reinforces this visually, with slow-adopting states showing a long flat stretch before a sudden fill near the federal deadline.

Analysis

# 1. THE SECTOR STORY: MUNICIPAL LEADERS VS. SPECIALIZED LAGGARDS
q2_trends_clean <- q2_trends %>%
  filter(total_agencies_type > 50, !agency_type %in% c("Unknown", "Other"))

ggplot(
  q2_trends_clean,
  aes(x = nibrs_start_year, y = cumulative_adoption_rate, color = agency_type)
) +
  geom_line(size = 1.2) +
  scale_y_continuous(labels = scales::percent_format(), expand = c(0, 0)) +
  scale_color_brewer(palette = "Dark2") +
  labs(
    # The title  states the primary finding
    title = "The 2021 NIBRS Mandate Forced a Deadline Surge for Specialized Agencies",
    # The subtitle explains the contrast and provides "how"
    subtitle = "While Municipal and County police achieved steady adoption over a decade, Tribal and \nUniversity reporting remained stagnant until vertical spikes following the federal deadline.",
    x = "Year",
    y = "Total Adoption Rate",
    color = "Agency Type"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 14),
    panel.grid.major = element_line(color = "gray90"),
    panel.grid.minor = element_blank()
  )

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# 2. THE GEOGRAPHIC STORY: PIONEERS VS. PROCRASTINATORS
target_states <- c("VA", "MI", "WV", "CA", "NY", "FL", "TX", "GA", "IL")
primary_color <- "#2c7fb8"
state_story <- agencies_cleaned %>%
  filter(
    is_nibrs == TRUE,
    !is.na(nibrs_start_year),
    state_abbr %in% target_states
  ) %>%
  group_by(state_abbr, nibrs_start_year) %>%
  summarise(n_new = n(), .groups = "drop") %>%
  group_by(state_abbr) %>%
  mutate(cum_rate = cumsum(n_new) / sum(n_new)) %>%
  # Reorder states by median adoption year so 'Pioneers' appear first and 'Procrastinators' last
  mutate(state_abbr = fct_reorder(state_abbr, nibrs_start_year, .fun = median))

ggplot(state_story, aes(x = nibrs_start_year, y = cum_rate)) +
  geom_area(fill = primary_color, alpha = 0.2) +
  geom_line(color = primary_color, size = 1) +
  # Vertical line marks the 2021 mandate to explain the sudden spikes in NY/CA/FL
  geom_vline(
    xintercept = 2021,
    linetype = "dashed",
    color = "gray50",
    size = 0.5
  ) +
  facet_wrap(~state_abbr, nrow = 3) +
  scale_y_continuous(
    labels = scales::percent_format(),
    limits = c(0, 1),
    expand = c(0, 0)
  ) +
  labs(
    # Narrative Title
    title = "State Mandates and Federal Deadlines Erased the Geographic 'Reporting Gap'",
    subtitle = "While early adopters like Virginia and Michigan reached full NIBRS participation decades ago,\nlaggards like California and New York only transitioned once the 2021 federal deadline arrived.",
    x = "Year",
    y = "Adoption Progress"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(face = "bold", size = 11),
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(
      size = 12,
      color = "gray20",
      margin = margin(b = 15)
    ),
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank(),
    panel.spacing = unit(1.5, "lines")
  )

Discussion

(1-3 paragraphs) In the Discussion section, interpret the results of your analysis. Identify any trends revealed (or not revealed) by the plots. Speculate about why the data looks the way it does.

Presentation

Our presentation can be found here.

Data

Johnson, B., & R4DS Online Learning Community. (2025). FBI Crime Data API — Law enforcement agencies [Data file]. TidyTuesday (Week 7, 2025).Retrieved February 11, 2026, from https://github.com/rfordatascience/tidytuesday/blob/main/data/2025/2025-02-18/readme.md

Federal Bureau of Investigation. (2025). Crime Data Explorer: Agency-level data [Data set]. U.S. Department of Justice. https://cde.ucr.cjis.gov/LATEST/webapp/#/pages/docApi

References

Federal Bureau of Investigation. (2025). Crime Data Explorer: Agency-level data [Data set]. U.S. Department of Justice. https://cde.ucr.cjis.gov/LATEST/webapp/#/pages/docApi