问题
I have a data frame that looks like this:
# Sample data: one row per municipality, holding its code, display name,
# region label and population (integer). All text columns stay character.
dat <- data.frame(
  Geocode = c(
    "1100015", "1100023", "1100031", "1100049", "1100056", "1100064",
    "1100072", "1100080", "1100098", "1100106", "1100114", "1100122",
    "1100130", "1100148", "1100155", "1100189", "1100205", "1100254",
    "1100262", "1100288", "1100296", "1100304", "1100320", "1100338",
    "1100346", "1100379", "1100403", "1100452", "1100502", "1100601"
  ),
  Location = c(
    "Alta Floresta D'oeste, RO", "Ariquemes, RO", "Cabixi, RO",
    "Cacoal, RO", "Cerejeiras, RO", "Colorado Do Oeste, RO",
    "Corumbiara, RO", "Costa Marques, RO", "Espigo D'oeste, RO",
    "Guajar-Mirim, RO", "Jaru, RO", "Ji-Paran, RO",
    "Machadinho D'oeste, RO", "Nova Brasilndia D'oeste, RO",
    "Ouro Preto Do Oeste, RO", "Pimenta Bueno, RO", "Porto Velho, RO",
    "Presidente Mdici, RO", "Rio Crespo, RO", "Rolim De Moura, RO",
    "Santa Luzia D'oeste, RO", "Vilhena, RO", "So Miguel Do Guapor, RO",
    "Nova Mamor, RO", "Alvorada D'oeste, RO",
    "Alto Alegre Dos Parecis, RO", "Alto Paraso, RO", "Buritis, RO",
    "Novo Horizonte Do Oeste, RO", "Cacaulandia, RO"
  ),
  # Ten consecutive rows per region, in this order.
  Region = rep(c("Norte", "Sul", "Nordeste"), each = 10L),
  Population = c(
    25578L, 104401L, 6355L, 87226L, 17986L, 18817L, 8842L, 16651L,
    32385L, 46632L, 55738L, 130419L, 37167L, 21592L, 39924L, 37512L,
    502748L, 22557L, 3750L, 56242L, 8532L, 91801L, 23933L, 27600L,
    17063L, 13940L, 20210L, 37838L, 10276L, 6367L
  ),
  stringsAsFactors = FALSE
)
It shows the population of some cities, as well as the region that the cities pertain to.
I need to classify the population into classes using breaks (breaks = c(0, 50000, 100000)), and then count the cities in each class, both overall (all regions combined) and separately for each region.
The resulting data frame should look like this (random, hypothetical values):
Class Region Count
[0-50000] Norte 7
[50000-100000] Norte 3
[>100000] Norte 0
[0-50000] Sul 5
[50000-100000] Sul 4
[>100000] Sul 1
[0-50000] Nordeste 4
[50000-100000] Nordeste 5
[>100000] Nordeste 1
[0-50000] All 16
[50000-100000] All 12
[>100000] All 2
Any help appreciated.
回答1:
You can do this with cut() and dplyr:
library(dplyr)

# Bin each population into right-closed intervals:
# (0, 50000], (50000, 100000], (100000, Inf). The result is a factor whose
# levels follow the order of `labels`.
dat$Class <- cut(
  dat$Population,
  breaks = c(0, 50000, 100000, Inf),
  labels = c("0-50000", "50000-100000", ">100000")
)

# Number of cities per class within each region.
# summarise() drops only the last grouping variable, so ungroup() explicitly
# to keep the leftover grouping by Class out of the combined result.
d1 <- dat %>%
  group_by(Class, Region) %>%
  summarise(count = n()) %>%
  ungroup()

# Number of cities per class over all regions, labelled "All".
d2 <- dat %>%
  group_by(Class) %>%
  summarise(count = n(), Region = "All")

# Stack the per-region rows on top of the overall rows.
bind_rows(d1, d2)
Class Region count
<fctr> <chr> <int>
1 0-50000 Nordeste 9
2 0-50000 Norte 8
3 0-50000 Sul 6
4 50000-100000 Nordeste 1
5 50000-100000 Norte 1
6 50000-100000 Sul 2
7 >100000 Norte 1
8 >100000 Sul 2
9 0-50000 All 23
10 50000-100000 All 4
11 >100000 All 3
回答2:
Here is a quick and dirty method; I might update this later to make it cleaner and to avoid having to use bind_rows().
Try the following:
library(tidyverse)

# Classify every city once so both summaries share a single definition.
# The bounds do not overlap, so the result does not depend on clause order:
# <= 50000, (50000, 100000], > 100000. A missing Population stays NA.
dat_classified <- dat %>%
  mutate(
    population_breaks = case_when(
      Population <= 50000 ~ "0-50000",
      Population > 50000 & Population <= 100000 ~ "50000-100000",
      Population > 100000 ~ ">100000"
    )
  )

# Number of cities per class within each region.
dat_1 <- dat_classified %>%
  group_by(population_breaks) %>%
  count(Region)

# Number of cities per class over all regions, labelled "All".
dat_2 <- dat_classified %>%
  group_by(population_breaks) %>%
  count(population_breaks) %>%
  mutate(Region = "All")

# Stack the per-region rows on top of the overall rows.
bind_rows(dat_1, dat_2)
Which returns:
# A tibble: 11 x 3
# Groups: population_breaks [3]
population_breaks Region n
<chr> <chr> <int>
1 0-50000 Nordeste 9
2 50000-100000 Nordeste 1
3 >100000 Norte 1
4 0-50000 Norte 8
5 50000-100000 Norte 1
6 >100000 Sul 2
7 0-50000 Sul 6
8 50000-100000 Sul 2
9 >100000 All 3
10 0-50000 All 23
11 50000-100000 All 4
来源:https://stackoverflow.com/questions/47233841/r-cut-by-breaks-and-count-number-of-occurrences-by-group