Create a panel data frame

旧城冷巷雨未停 提交于 2019-12-01 08:55:18

1) reshape2. Create a grid g that crosses every year with every id value, and rbind it with frame.

Then using the reshape2 package cast frame from long to wide form and then melt it back to long form. Finally rearrange the rows and columns as desired.

The lines ending in a single # are there only to ensure that every year is present, so if we knew that was already the case, those lines could be omitted. The line ending in ## only rearranges the rows and columns, so if that does not matter, that line could be omitted too.

library(reshape2)

# Zero-valued row for every (year, id) pair across the observed year range.
g <- with(frame, expand.grid(year = seq(min(year), max(year)), id = unique(id), y = 0)) #
frame <- rbind(frame, g) #

# Long -> wide (one column per id, summing y within each year/id cell),
# then back to long. Argument names are spelled out in full rather than
# relying on partial matching of `fun` and `id`.
wide <- dcast(frame, year ~ id, fun.aggregate = sum, fill = 0, value.var = "y")
long <- melt(wide, id.vars = "year", variable.name = "id", value.name = "y")

# Sort by id, then year, and put the columns in id, year, y order.
long <- long[order(long$id, long$year), c("id", "year", "y")] ##

giving:

> long
   id year y
1   1 2005 1
2   1 2006 0
3   1 2007 0
4   1 2008 0
5   2 2005 0
6   2 2006 0
7   2 2007 0
8   2 2008 0
9   3 2005 0
10  3 2006 0
11  3 2007 0
12  3 2008 0
13  4 2005 0
14  4 2006 0
15  4 2007 1
16  4 2008 0

2) aggregate. A shorter solution is to run just the two lines that end with # above and then follow them with an aggregate, as shown. This solution uses no add-on packages.

# Pad frame with a y = 0 row for each (year, id) pair in the observed range.
g <- expand.grid(year = seq(from = min(frame$year), to = max(frame$year)), id = unique(frame$id), y = 0) #
frame <- rbind(frame, g) #

# Total y within each year/id cell, then present the columns as id, year, y.
aggregate(y ~ year + id, data = frame, FUN = sum)[, c("id", "year", "y")]

This gives the same answer as solution (1), except that, as noted by a commenter, solution (1) above makes id a factor, whereas it is not a factor in this solution.

Using data.table:

# library() stops with an error if data.table is not installed; require()
# would only return FALSE and let the calls below fail less clearly.
library(data.table)
# Key the table on (id, year) so the join below matches on those columns.
DT <- data.table(frame, key = c("id", "year"))
comb <- CJ(1:4, 2005:2008) # like 'expand.grid', but faster + sets key
ans <- DT[comb][is.na(y), y := 0L] # perform a join (DT[comb]), then set NAs to 0
#     id year y
#  1:  1 2005 1
#  2:  1 2006 0
#  3:  1 2007 0
#  4:  1 2008 0
#  5:  2 2005 0
#  6:  2 2006 0
#  7:  2 2007 0
#  8:  2 2008 0
#  9:  3 2005 0
# 10:  3 2006 0
# 11:  3 2007 0
# 12:  3 2008 0
# 13:  4 2005 0
# 14:  4 2006 0
# 15:  4 2007 1
# 16:  4 2008 0

Maybe not an elegant solution, but anyway:

# Keep only the rows of frame that carry a non-zero y.
frame <- frame[frame$y != 0, ]

# Full id x year grid, with y defaulting to 0.
df <- expand.grid(id = id, year = unique(year))
df[["y"]] <- 0

# Stack the real rows ahead of the grid so that, for each (id, year) pair,
# the first occurrence (the non-zero row, when present) is the one retained.
df2 <- rbind(frame, df)
df2 <- df2[!duplicated(df2[c("id", "year")]), ]

# Order by id, then year, and reset the row names to 1..n.
df2 <- df2[with(df2, order(id, year)), ]
row.names(df2) <- NULL
df2
# id year y
# 1   1 2005 1
# 2   1 2006 0
# 3   1 2007 0
# 4   1 2008 0
# 5   2 2005 0
# 6   2 2006 0
# 7   2 2007 0
# 8   2 2008 0
# 9   3 2005 0
# 10  3 2006 0
# 11  3 2007 0
# 12  3 2008 0
# 13  4 2005 0
# 14  4 2006 0
# 15  4 2007 1
# 16  4 2008 0
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!