问题
I have a line plot of some event at a hospital that I have been struggling with.
The challenges that I haven't solved yet are: 1) sorting the lines on the plot so that the patient lines are ordered by assessment date, 2) coloring the lines by the variable 'openCase', and finally 3) removing the Discharge point (the blue square) for the cases that fall in the year 2014 (or after some other arbitrary cut-off date).
Any help would be appreciated.
Here is my sample data,
# Build a random sample data set: 26 patients (A-Z) with three visit dates
# each, then draw one line per patient with a marker at every visit.
library(ggplot2)
library(plyr)
df <- data.frame(
  # 78 distinct dates drawn without replacement from a 5-day grid starting today
  date = seq(Sys.Date(), length.out = 156, by = "5 day")[sample(156, 78)],
  openCase = rep(0:1, 39),
  patients = factor(rep(1:26, 3), labels = LETTERS)
)
# Number the visits within each patient.
# NOTE(review): order() returns the sorting permutation, not each date's rank,
# so the labels need not follow chronological order -- rank(date) may be what
# is actually intended here.
df <- ddply(df, "patients", mutate, visit = order(date))
df$visit <- as.factor(df$visit)
levels(df$visit) <- c("Assessment (1)", "Treatment (2)", "Discharge (3)")
# ggplot() + geom_line() replaces qplot(..., geom = "line"), which is
# deprecated as of ggplot2 3.4.0. shape = 0 is a hollow square.
ggplot(df, aes(date, patients)) +
  geom_line() +
  geom_point(aes(colour = visit), size = 2, shape = 0)
I'm aware that my example data is not perfect, as some of the assessment dates are after the treatment dates and some of the discharge dates are before the assessment dates, but that is part of the challenge: my base data is messed up.
What it looks like at the moment,

Update 2012-04-30 16:30:13 PDT
My data is delivered from a database and looks something like this,
# Data frame literal (dput-style): 78 rows = 26 patients (A-Z) x 3 visits.
# Columns: date (Date, stored as days since 1970-01-01), openCase (0/1,
# constant within each patient in this sample), patients (factor A-Z) and
# visit (factor with the arbitrary labels "zym", "xov", "poi").
df <- structure(list(date = structure(c(15965L, 15680L, 16135L, 15730L,
15920L, 15705L, 16110L, 15530L, 15575L, 15905L, 16140L, 15795L,
15955L, 15945L, 16205L, 15675L, 15525L, 15830L, 15625L, 15725L,
15855L, 15840L, 15615L, 15500L, 15780L, 15765L, 15610L, 15690L,
16080L, 15570L, 15685L, 16175L, 15740L, 15600L, 15985L, 15485L,
15605L, 16115L, 15535L, 15755L, 16145L, 16040L, 15970L, 16000L,
16075L, 15995L, 16010L, 15990L, 15665L, 15895L, 15865L, 16120L,
15880L, 15930L, 16055L, 15820L, 15650L, 16155L, 15700L, 15640L,
15505L, 15750L, 15800L, 15775L, 15825L, 15635L, 16150L, 15860L,
16100L, 15475L, 16050L, 15785L, 15495L, 15810L, 15805L, 15490L,
15460L, 16085L), class = "Date"), openCase = c(0L, 0L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L), patients = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L,
6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L,
11L, 12L, 12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L,
16L, 16L, 16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L,
20L, 20L, 21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L,
24L, 25L, 25L, 25L, 26L, 26L, 26L), .Label = c("A", "B", "C",
"D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"), class = "factor"),
visit = structure(c(2L, 1L, 3L, 3L, 1L, 2L, 2L, 3L, 1L, 3L,
1L, 2L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 2L, 3L, 3L, 2L, 1L, 3L,
2L, 1L, 3L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 1L,
3L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 3L, 2L, 2L, 3L, 1L, 3L,
2L, 1L, 3L, 2L, 1L, 1L, 2L, 3L, 3L, 1L, 2L, 2L, 3L, 1L, 1L,
3L, 2L, 1L, 3L, 2L, 2L, 1L, 3L), .Label = c("zym", "xov", "poi"
), class = "factor")), .Names = c("date", "openCase", "patients",
"visit"), row.names = c(NA, -78L), class = "data.frame")
The number of levels in `visit`, and their specific labels, will most likely change, so I would like some kind of code that ranks or sorts based on my existing data (`visit`) instead of generating new variables.
回答1:
I'm still not sure I understand what is wrong with @Ben's answer, but I'll try adding one of my own, starting with the `df` given in the edit.
Create a new variable `Visit` (note the capital V) which is Assessment/Treatment/Discharge based on the ordering of the dates given. This is @Ben's code, just rewritten.
# Label each patient's visits chronologically as Assessment/Treatment/Discharge.
# ties.method = "first" keeps the ranks integral (1, 2, 3) even when two visits
# of one patient share a date; the default "average" would yield e.g. 1.5,
# which matches none of the levels 1:3 and silently becomes NA.
df <- ddply(df, "patients", mutate,
            Visit = factor(rank(date, ties.method = "first"),
                           levels = 1:3,
                           labels = c("Assessment (1)", "Treatment (2)",
                                      "Discharge (3)")))
I don't understand how this relates to the `visit` column in the original data; in fact, the original `visit` column is not used hereafter:
> table(df$Visit, df$visit)
zym xov poi
Assessment (1) 16 7 3
Treatment (2) 3 16 7
Discharge (3) 7 3 16
Reorder the patients (again copying Ben):
# Re-level the patient factor by each patient's earliest date, so the factor
# levels follow the date of the first visit.
df$patients <- reorder(df$patients, as.numeric(df$date), FUN = min)
Determine the subset of points that should be shown (same idea as Ben, but different code)
# Rows whose points are drawn: everything except discharge visits dated after
# 2014-01-01 (De Morgan form of "not (discharge and after the cut-off)").
df2 <- df[df$Visit != "Discharge (3)" | df$date <= as.Date("2014-01-01"), ]
To add something new, here is a way to make the lines different colors without impacting the legend
# Lines get a fixed colour per openCase subset (set outside aes(), so they add
# no legend entry); only the visit points are mapped to the colour scale.
ggplot(data = df, mapping = aes(x = date, y = patients)) +
  # empty layer drawn from the full data before the subset layers are added
  geom_blank() +
  geom_line(colour = "black", data = df[df$openCase == 0, ]) +
  geom_line(colour = "red", data = df[df$openCase == 1, ]) +
  geom_point(mapping = aes(colour = Visit), data = df2, shape = 0, size = 2)

回答2:
This is part-way:
Starting from after your initial definition of the data.
First, I think you want `rank(date)` rather than `order(date)` -- it made more sense to me, anyway.
# Rank the dates within each patient. ties.method = "first" guarantees integer
# ranks; the default "average" produces fractional ranks (e.g. 1.5) on tied
# dates, which would add extra factor levels and break the labelling below.
df <- ddply(df, "patients", mutate, visit = rank(date, ties.method = "first"))
# Pin the levels to 1:3 so each label is tied to its rank explicitly instead
# of relying on whichever levels happen to be present in the data.
df$visit <- factor(df$visit, levels = 1:3,
                   labels = c("Assessment (1)", "Treatment (2)", "Discharge (3)"))
Reorder patients by minimum date value (= Assessment date):
# Sort the patient levels by the smallest (numeric) date seen for each patient.
df$patients <- reorder(df$patients, as.numeric(df$date), FUN = min)
Create a new data set without the Discharge points that fall after Jan 1 2014 (if you wanted to drop the Discharge point for cases that were assessed after a given date, you'd need to use `ddply`):
# Keep a row unless it is a discharge visit dated after 2014-01-01
# (equivalent De Morgan form of the negated conjunction).
df2 <- subset(df, visit != "Discharge (3)" | date <= as.Date("2014-01-01"))
As @Joran pointed out above, it's a bit hard to get two separate colour scales for different variables, but this sort of works (you have to make `openCase` into a factor in order to combine it with the colour scale for `visit`):
# Lines and points share one colour scale: lines are coloured by openCase
# (as a factor), points by visit; points come from the filtered df2.
ggplot(data = df, mapping = aes(x = date, y = patients)) +
  geom_line(mapping = aes(colour = factor(openCase))) +
  geom_point(mapping = aes(colour = visit), data = df2, shape = 0, size = 2)
Alternatively (and I think this is prettier anyway), you could code `openCase` with line type:
# Encode openCase with the line type, leaving the colour scale to the visit
# points alone; points come from the filtered df2.
ggplot(data = df, mapping = aes(x = date, y = patients)) +
  geom_line(mapping = aes(linetype = factor(openCase))) +
  geom_point(mapping = aes(colour = visit), data = df2, shape = 0, size = 2)

来源:https://stackoverflow.com/questions/10359963/edits-in-a-ggplot2-geom-line