Input 0 where there is no value in R data frame

两盒软妹~` 提交于 2020-02-04 22:57:25

问题


I want a data.frame like the one below, but with a row for every year within each topic. The one I built counts the number of items per year for each topic, but when a topic has no items in a given year, no row is created for that topic/year pair, so the corresponding line in the final graph has a gap. Could anyone please tell me how to add the missing years with Count == 0 for the topics that have no value?

# Example data (dput output): a 96-row data.frame with one row per observed
# Topic/Year pair.
#   Topic - factor with 12 levels; the levels are in alphabetical order
#           ("Topic 1", "Topic 10", "Topic 11", "Topic 12", "Topic 2", ...),
#           so integer code 2 is "Topic 10", not "Topic 2".
#   Year  - factor with levels "2011" through "2019".
#   Count - integer count for that Topic/Year pair.
# Not every topic has a row for every year (12 topics x 9 years would be 108
# rows); e.g. "Topic 10" has no rows for 2011 or 2014.
dtd2 <- structure(list(Topic = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), .Label = c("Topic 1", 
"Topic 10", "Topic 11", "Topic 12", "Topic 2", "Topic 3", "Topic 4", 
"Topic 5", "Topic 6", "Topic 7", "Topic 8", "Topic 9"), class = "factor"), 
    Year = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 2L, 
    3L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
    9L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
    8L, 9L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 6L, 7L, 8L, 
    9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 
    6L, 7L, 8L, 9L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("2011", 
    "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"
    ), class = "factor"), Count = c(3L, 3L, 3L, 5L, 5L, 11L, 
    17L, 14L, 4L, 1L, 1L, 4L, 2L, 3L, 9L, 4L, 2L, 1L, 3L, 4L, 
    5L, 18L, 23L, 19L, 15L, 1L, 5L, 6L, 8L, 11L, 17L, 7L, 1L, 
    3L, 6L, 4L, 20L, 21L, 18L, 12L, 3L, 1L, 1L, 2L, 5L, 5L, 11L, 
    5L, 2L, 1L, 1L, 2L, 2L, 5L, 7L, 23L, 9L, 1L, 1L, 2L, 3L, 
    6L, 4L, 9L, 8L, 1L, 1L, 6L, 2L, 3L, 3L, 1L, 3L, 2L, 5L, 7L, 
    11L, 11L, 28L, 11L, 2L, 1L, 2L, 2L, 5L, 6L, 5L, 16L, 3L, 
    4L, 2L, 2L, 7L, 6L, 8L, 6L)), row.names = c(NA, -96L), class = "data.frame")

# Timeline of Count per Year: one coloured line, with points, per Topic.
ggplot(
  data = dtd2,
  mapping = aes(x = Year, y = Count, colour = Topic, group = Topic)
) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = NULL, title = "Timeline")

回答1:


A time-series approach, using the tsibble package to fill the gaps, could be:

library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
library(tsibble)
#> 
#> Attaching package: 'tsibble'
#> The following objects are masked from 'package:lubridate':
#> 
#>     interval, new_interval
#> The following object is masked from 'package:dplyr':
#> 
#>     id


# Example data (dput output): a 96-row data.frame with one row per observed
# Topic/Year pair.
#   Topic - factor with 12 levels in alphabetical order ("Topic 1",
#           "Topic 10", "Topic 11", "Topic 12", "Topic 2", ...).
#   Year  - factor with levels "2011" through "2019".
#   Count - integer count for that Topic/Year pair.
# Some Topic/Year pairs have no row at all (e.g. "Topic 10" has no rows for
# 2011 or 2014); those are the gaps to be filled with Count = 0.
dtd2 <- structure(list(Topic = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
  1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
  3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
  5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
  7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 
  10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 
  11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), .Label = c("Topic 1", 
    "Topic 10", "Topic 11", "Topic 12", "Topic 2", "Topic 3", "Topic 4", 
    "Topic 5", "Topic 6", "Topic 7", "Topic 8", "Topic 9"), class = "factor"), 
  Year = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 2L, 
    3L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
    9L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
    8L, 9L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 6L, 7L, 8L, 
    9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 
    6L, 7L, 8L, 9L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("2011", 
      "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"
    ), class = "factor"), Count = c(3L, 3L, 3L, 5L, 5L, 11L, 
      17L, 14L, 4L, 1L, 1L, 4L, 2L, 3L, 9L, 4L, 2L, 1L, 3L, 4L, 
      5L, 18L, 23L, 19L, 15L, 1L, 5L, 6L, 8L, 11L, 17L, 7L, 1L, 
      3L, 6L, 4L, 20L, 21L, 18L, 12L, 3L, 1L, 1L, 2L, 5L, 5L, 11L, 
      5L, 2L, 1L, 1L, 2L, 2L, 5L, 7L, 23L, 9L, 1L, 1L, 2L, 3L, 
      6L, 4L, 9L, 8L, 1L, 1L, 6L, 2L, 3L, 3L, 1L, 3L, 2L, 5L, 7L, 
      11L, 11L, 28L, 11L, 2L, 1L, 2L, 2L, 5L, 6L, 5L, 16L, 3L, 
      4L, 2L, 2L, 7L, 6L, 8L, 6L)), row.names = c(NA, -96L), class = "data.frame")
# Build `tsibble2`: one row per Topic and calendar year, with Count = 0 for
# the years in which a topic had no row in `dtd2`.
# Result columns: Topic, year (factor), Count.
tsibble2 <- dtd2 %>%
  # Turn the year label into a Date by appending "0101" (1 January).
  mutate(Year = as_date(str_c(Year, "01", "01"))) %>%
  as_tsibble(index = Year, key = Topic) %>%
  # Insert the missing index values as rows with NA Count. Per the tsibble
  # documentation, `.full = TRUE` pads every key over the time span of the
  # whole data set rather than over each key's own first-to-last span.
  # NOTE(review): with a Date index the inferred interval is presumably finer
  # than one year, which would be why the rows are re-aggregated by calendar
  # year below -- confirm.
  tsibble::fill_gaps(.full = TRUE) %>%
  group_by_key() %>%
  index_by(year = year(Year)) %>%
  # With na.rm = TRUE the inserted NA rows contribute nothing to the sum,
  # so a year with no original row ends up with Count = 0.
  summarise(Count = sum(Count, na.rm = TRUE)) %>%
  as_tibble() %>%
  # Back to a factor so the plot treats the year as a discrete axis.
  mutate(year = as_factor(year))

# Timeline of the gap-filled counts: one coloured line, with points, per Topic.
ggplot(
  data = tsibble2,
  mapping = aes(x = year, y = Count, color = Topic, group = Topic)
) +
  geom_line() +
  geom_point()

Created on 2020-01-08 by the reprex package (v0.3.0)




回答2:


We can use complete from tidyr to add missing years and fill Count values with 0.

# Add a row for every Topic/Year combination that is absent from dtd2 and
# set Count to 0 on those new rows.
tidyr::complete(
  data = dtd2,
  Topic,
  Year = unique(Year),
  fill = list(Count = 0)
)

#A tibble: 108 x 3
#   Topic    Year  Count
#   <fct>    <fct> <dbl>
# 1 Topic 1  2011      3
# 2 Topic 1  2012      3
# 3 Topic 1  2013      3
# 4 Topic 1  2014      5
# 5 Topic 1  2015      5
# 6 Topic 1  2016     11
# 7 Topic 1  2017     17
# 8 Topic 1  2018     14
# 9 Topic 1  2019      4
#10 Topic 10 2011      0
# … with 98 more rows

and use it in ggplot2 so that the lines are connected

library(ggplot2)
# ggplot2 does not export the `%>%` pipe used below; tidyr re-exports it,
# so attach tidyr to make this snippet run in a fresh session.
library(tidyr)

# Fill the missing Topic/Year pairs with Count = 0, then draw one connected
# line (with points) per Topic.
complete(dtd2, Topic, Year = unique(Year), fill = list(Count = 0)) %>%
  ggplot(aes(x = Year, y = Count, colour = Topic, group = Topic)) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = NULL, title = "Timeline")




回答3:


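We can use `expand()` from tidyr to build every Topic/Year combination, join the original counts back on, and replace the resulting `NA` counts with 0: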

library(dplyr)
library(tidyr)
library(ggplot2)
# Put the Topic levels in natural order ("Topic 2" before "Topic 10") once,
# up front, so both sides of the join below carry identical factor levels.
# NOTE(review): joining factors whose levels differ can coerce or reorder
# them depending on the dplyr version -- confirm against the version in use.
dtd2_sorted <- dtd2 %>%
  mutate(Topic = factor(Topic, levels = gtools::mixedsort(levels(Topic))))

dtd2_sorted %>%
  # Every Topic/Year combination, whether or not it occurs in the data.
  expand(Topic, Year = unique(Year)) %>%
  # Bring the observed counts back; explicit keys avoid the implicit
  # natural-join message and make the join columns obvious.
  left_join(dtd2_sorted, by = c("Topic", "Year")) %>%
  # Combinations with no observed row get Count = 0 (integer, like Count).
  mutate(Count = replace_na(Count, 0L)) %>%
  ggplot(aes(x = Year, y = Count, colour = Topic, group = Topic)) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = NULL, title = "Timeline")

-output



来源:https://stackoverflow.com/questions/59655667/input-0-where-there-is-no-value-in-r-data-frame

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!