# Recode the value 0 / "0" as NA in the region, industry, and age columns
# so that it is treated as missing rather than as a real value.
billionaire_2001$location.region[billionaire_2001$location.region == "0"] <- NA
billionaire_2001$wealth.how.industry[
  billionaire_2001$wealth.how.industry == "0"
] <- NA
billionaire_2001$demographics.age[billionaire_2001$demographics.age == 0] <- NA

# Copy of the data without the outlier "Bill Gates", used to check how much
# that single observation influences the age-vs-wealth relationship.
billionaire_2001_age_wealth_outliner <- billionaire_2001 |>
  filter(name != "Bill Gates")

# Scatter plot of founding age against wealth with a fitted linear trend line.
ggplot(
  billionaire_2001_age_wealth_outliner,
  aes(x = age.when.founded.company, y = wealth.worth.in.billions)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Billionaire's Age When They Founded Their Company vs. Their Wealth",
    x = "Age When They Founded Their Company",
    y = "Wealth Worth in Billions"
  ) +
  scale_y_continuous(labels = label_dollar())
#| label: age-wealth-correlation-and-linear-reg-outlier
#| include: false

# Correlation - age vs wealth - outlier
# Rows missing either variable are dropped first so cor() does not return NA.
age_wealth_corr_outlier <- billionaire_2001_age_wealth_outliner |>
  drop_na(age.when.founded.company, wealth.worth.in.billions) |>
  summarize(
    age_cor = cor(age.when.founded.company, wealth.worth.in.billions)
  )
age_wealth_corr_outlier
age_cor
1 -0.09624012
# Linear regression - age vs wealth - outlier
# Models wealth as a function of founding age on the data without the outlier.
age_wealth_fit_outlier <- linear_reg() |>
  fit(
    wealth.worth.in.billions ~ age.when.founded.company,
    data = billionaire_2001_age_wealth_outliner
  )
tidy(age_wealth_fit_outlier)
As we can see in the plot, there is one billionaire with a young founding age (under 20) who has the most wealth (much more than the others), so we wondered whether this affects the result. The code above shows the process of deleting the outlier “Bill Gates” (the wealthiest billionaire). However, the correlation is still negative and does not differ much. Since deleting the outlier does not do much, we deleted it.
# A tibble: 538 × 11
# Groups: location.citizenship [46]
name demographics.age location.citizenship location.gdp location.region
<chr> <int> <chr> <dbl> <chr>
1 Bill Gates 45 United States 1.06e13 North America
2 Warren Bu… 70 United States 1.06e13 North America
3 Paul Allen 48 United States 1.06e13 North America
4 Larry Ell… 56 United States 1.06e13 North America
5 Jim Walton 53 United States 1.06e13 North America
6 John Walt… 55 United States 1.06e13 North America
7 S Robson … 57 United States 1.06e13 North America
8 Alice Wal… 52 United States 1.06e13 North America
9 Helen Wal… 81 United States 1.06e13 North America
10 Steven Ba… 44 United States 1.06e13 North America
# ℹ 528 more rows
# ℹ 6 more variables: wealth.worth.in.billions <dbl>,
# wealth.how.industry <chr>, wealth.how.inherited <chr>,
# age.when.founded.company <dbl>, n <int>, location.num.billionaires <int>
However, for the analysis of countries’ GDP vs. the number of billionaires in that country, deleting the outlier does make it better. Looking at the plot, we can see that there is one country with a very high GDP and a very high number of billionaires (the code above shows how we found the outlier).