library(tidyverse)
library(flextable)
library(janitor)
# Load the state-level data set, normalise the column names with
# janitor::clean_names(), and drop rows/columns that are entirely empty.
# `which` is spelled out because remove_empty() prints a message and falls
# back to c("rows", "cols") when the argument is omitted.
state <- read_csv("state.csv") %>%
  clean_names() %>%
  remove_empty(which = c("rows", "cols"))
# glimpse() (available through the tidyverse) gives a compact overview of a
# data frame: number of rows, number of columns, column names, column types
# and the first few values of each column.
state %>%
  glimpse()
## Rows: 50
## Columns: 4
## $ state <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi...
## $ population <dbl> 4779736, 710231, 6392017, 2915918, 37253956, 5029196, ...
## $ murder_rate <dbl> 5.7, 5.6, 4.7, 5.6, 4.4, 2.8, 2.4, 5.8, 5.8, 5.7, 1.8,...
## $ abbreviation <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", ...
# summary() prints rough per-column statistics: min, quartiles, median, mean
# and max for numeric columns; length, class and mode for character columns.
# For a more in-depth profile, see the dlookr package.
state %>%
  summary()
## state population murder_rate abbreviation
## Length:50 Min. : 563626 Min. : 0.900 Length:50
## Class :character 1st Qu.: 1833004 1st Qu.: 2.425 Class :character
## Mode :character Median : 4436370 Median : 4.000 Mode :character
## Mean : 6162876 Mean : 4.066
## 3rd Qu.: 6680312 3rd Qu.: 5.550
## Max. :37253956 Max. :10.300
# Mean population across all states.
# `state` holds one row per state (50 rows), so grouping by `state` before
# calling mean() would just hand back each row's own population. The mean is
# therefore taken over the whole table.
# To rank states by size instead, sort on `population` directly, e.g.
# `state %>% arrange(desc(population))`.
state %>%
  summarise(mean.pop = mean(population))
# Expected: a 1 x 1 tibble whose value agrees with the `Mean` that
# summary(state) reports for `population`.
# Trimmed mean: the average of all values after dropping a fixed share of the
# most extreme values. With trim = 0.1, mean() discards 10% of the
# observations from each end of the sorted values before averaging.
# The statistic is computed over all states: `state` has one row per state, so
# grouping by `state` would leave a single observation per group and nothing
# would be trimmed.
state %>%
  summarise(mean.trim = mean(population, trim = 0.1))
# Expected: a 1 x 1 tibble (re-run to regenerate the printed value).
# 10% trimmed mean of the murder rate, computed over all states.
# No group_by(state) here: with one row per state, a per-state trimmed mean
# would simply return each state's own murder rate.
state %>%
  summarise(mean.trim.murder = mean(murder_rate, trim = 0.1))
# Expected: a 1 x 1 tibble (re-run to regenerate the printed value).
# Plain mean vs. 20% trimmed mean of the population, side by side.
# Both are computed over all states so that the trimming actually removes
# observations; grouped by `state` (one row per group) the two columns would
# always be identical.
state %>%
  summarise(
    mean.pop = mean(population),
    mean.trim.pop = mean(population, trim = 0.2)
  )
# Expected: a 1 x 2 tibble (re-run to regenerate the printed values).
# Calculate the median of the population feature
# Median: the value such that one-half of the data lies above it and one-half
# below it.
# Computed over all states; with one row per state, a per-state median would
# just return that state's own population.
state %>%
  summarise(median.pop = median(population))
# Expected: a 1 x 1 tibble whose value agrees with the `Median` that
# summary(state) reports for `population`.
# Re-check the structure of the data frame (column names and types).
state %>%
  glimpse()
## Rows: 50
## Columns: 4
## $ state <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi...
## $ population <dbl> 4779736, 710231, 6392017, 2915918, 37253956, 5029196, ...
## $ murder_rate <dbl> 5.7, 5.6, 4.7, 5.6, 4.4, 2.8, 2.4, 5.8, 5.8, 5.7, 1.8,...
## $ abbreviation <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", ...
# Median murder rate and median population, computed over all states.
# Grouping by `state` is deliberately avoided: each group would contain a
# single row, so the "median" would only echo the raw values.
state %>%
  summarise(
    median.murder = median(murder_rate),
    median.pop = median(population)
  )
# Expected: a 1 x 2 tibble whose values agree with the `Median` entries that
# summary(state) reports for `murder_rate` and `population`.
#install.packages("matrixStats")
library(matrixStats)
# Population-weighted mean of the murder rate, using base R only.
with(state, weighted.mean(murder_rate, w = population))
## [1] 4.445834
# To compute the average murder rate for the whole country, the per-state
# rates must be weighted by each state's population; otherwise small and large
# states would count equally.
# The weighted mean is taken over all rows. Grouping by `state` first would
# give one row per group, and a weighted mean of a single value is that value
# itself.
state %>%
  summarise(meanpop = weighted.mean(murder_rate, population))
## # A tibble: 1 x 1
## meanpop
## <dbl>
## 1 4.45
# Population-weighted median of the murder rate for the whole country, using
# matrixStats::weightedMedian().
# As with the weighted mean, this is computed over all rows: a per-state
# weighted median of a single observation would only echo that state's own
# murder rate.
state %>%
  summarise(medianmurder = weightedMedian(murder_rate, population))
## # A tibble: 1 x 1
## medianmurder
## <dbl>
## 1 4.4
# - The basic metric for location is the mean, but it can be sensitive to extreme values (outliers).
# - Other metrics (median, trimmed mean) are less sensitive to outliers and unusual distributions and hence are more robust.