How to subset consecutive rows if they meet a condition

房东的猫 提交于 2019-12-05 03:36:16

An approach with data.table which is slightly different from @jlhoward's approach (using the same data):

library(data.table)

# Turn df into a data.table so the := column assignments below work.
setDT(df)

# hotday: 1 when both temperature thresholds are met on that day, else 0.
df[, hotday := as.integer(MAX >= 44.5 & MIN >= 24.5)]

# hw.length: length of the run of identical hotday values the row belongs to.
df[, hw.length := {
  runs <- rle(hotday)
  rep(runs$lengths, times = runs$lengths)
}]

# Runs of non-hot days are not heat waves, so their length is reset to 0.
df[hotday == 0, hw.length := 0]

This produces a data.table with a heat wave length variable (hw.length) instead of a TRUE/FALSE variable for a specific heat wave length:

> df
    YEAR MONTH DAY  MAX  MIN hotday hw.length
 1: 1989     7  18 45.0 23.5      0         0
 2: 1989     7  19 44.2 26.1      0         0
 3: 1989     7  20 44.7 24.4      0         0
 4: 1989     7  21 44.6 29.5      1         1
 5: 1989     7  22 44.4 31.6      0         0
 6: 1989     7  23 44.2 26.7      0         0
 7: 1989     7  24 44.5 25.0      1         3
 8: 1989     7  25 44.8 26.0      1         3
 9: 1989     7  26 44.8 24.6      1         3
10: 1989     7  27 45.0 24.3      0         0
11: 1989     7  28 44.8 26.0      1         1
12: 1989     7  29 44.4 24.0      0         0
13: 1989     7  30 45.2 25.0      1         1

I may be missing something here but I don't see the point of subsetting beforehand. If you have data for every day, in chronological order, you can use run length encoding (see the docs on the rle(...) function).

In this example we create an artificial data set and define "heat wave" as MAX >= 44.5 and MIN >= 24.5. Then:

# Artificial example: 13 consecutive days in July 1989.
df <- data.frame(
  YEAR = 1989,
  MONTH = 7,
  DAY = 18:30,
  MAX = c(45, 44.2, 44.7, 44.6, 44.4, 44.2, 44.5,
          44.8, 44.8, 45, 44.8, 44.4, 45.2),
  MIN = c(23.5, 26.1, 24.4, 29.5, 31.6, 26.7, 25,
          26, 24.6, 24.3, 26, 24, 25)
)

# r: for every row, the length of the run of equal hot/not-hot flags that
# the row belongs to (run-length encoding expanded back to one value per row).
r <- with(df, {
  runs <- rle(MAX >= 44.5 & MIN >= 24.5)
  rep(runs$lengths, times = runs$lengths)
})

# A heat-wave day is a hot day that sits inside a run longer than 2 days.
df$heat.wave <- (r > 2) & (df$MAX >= 44.5 & df$MIN >= 24.5)
df
#    YEAR MONTH DAY  MAX  MIN heat.wave
# 1  1989     7  18 45.0 23.5     FALSE
# 2  1989     7  19 44.2 26.1     FALSE
# 3  1989     7  20 44.7 24.4     FALSE
# 4  1989     7  21 44.6 29.5     FALSE
# 5  1989     7  22 44.4 31.6     FALSE
# 6  1989     7  23 44.2 26.7     FALSE
# 7  1989     7  24 44.5 25.0      TRUE
# 8  1989     7  25 44.8 26.0      TRUE
# 9  1989     7  26 44.8 24.6      TRUE
# 10 1989     7  27 45.0 24.3     FALSE
# 11 1989     7  28 44.8 26.0     FALSE
# 12 1989     7  29 44.4 24.0     FALSE
# 13 1989     7  30 45.2 25.0     FALSE

This creates a column, heat.wave, which is TRUE if that day was part of a heat wave. If you need to extract only the heat-wave days, use

# Keep only the rows whose heat.wave flag is TRUE.
df[df[["heat.wave"]], ]
#   YEAR MONTH DAY  MAX  MIN heat.wave
# 7 1989     7  24 44.5 25.0      TRUE
# 8 1989     7  25 44.8 26.0      TRUE
# 9 1989     7  26 44.8 24.6      TRUE

Your question really boils down to finding groupings of 3+ consecutive days in your subsetted dataset, removing all remaining data.

Let's consider an example where we would want to keep some rows and remove others:

# Days that already passed the temperature filter (one row per hot day).
dat <- data.frame(
  year = 1989,
  month = rep(c(6, 7, 8, 10), times = c(1, 5, 3, 2)),
  day = c(12, 11, 12, 13, 14, 21, 5, 6, 7, 12, 13)
)
dat
#    year month day
# 1  1989     6  12
# 2  1989     7  11
# 3  1989     7  12
# 4  1989     7  13
# 5  1989     7  14
# 6  1989     7  21
# 7  1989     8   5
# 8  1989     8   6
# 9  1989     8   7
# 10 1989    10  12
# 11 1989    10  13

I've excluded the temperature data, because I'm assuming we've already subsetted to just the days that exceed the 90th percentile using the code from your question.

In this dataset there is a 4-day heat wave in July and a three-day heat wave in August. The first step would be to convert the data to date objects and compute the number of days between consecutive observations (I assume the data is already ordered by day here):

# Build Date objects from the year/month/day columns.
dates <- as.Date(with(dat, paste(year, month, day, sep = "-")))

# dd[k]: number of days between observation k + 1 and observation k.
dd <- as.numeric(
  difftime(dates[-1], dates[-length(dates)], units = "days")
)
dd
# [1] 29  1  1  1  7 15  1  1 66  1

We're close, because now we can see the time periods where there were multiple date gaps of 1 day -- these are the ones we want to grab. We can use the rle function to analyze runs of the number 1, keeping only the runs of length 2 or more:

# valid.gap[k] is TRUE when gap k is a 1-day gap inside a run of at least
# two consecutive 1-day gaps (i.e. at least three consecutive days).
valid.gap <- with(rle(dd == 1), {
  keep.run <- values & lengths >= 2
  rep(keep.run, times = lengths)
})
valid.gap
# [1] FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE

Finally, we can subset the dataset to just the days that were on either side of a 1-day date gap that is part of a heat wave:

# Gap k lies between rows k and k + 1, so a row is kept when the gap after
# it (valid.gap padded at the end) or the gap before it (padded at the
# start) is a valid gap.
dat[c(valid.gap, FALSE) | c(FALSE, valid.gap), ]
#   year month day
# 2 1989     7  11
# 3 1989     7  12
# 4 1989     7  13
# 5 1989     7  14
# 7 1989     8   5
# 8 1989     8   6
# 9 1989     8   7

A simple approach, though not fully vectorized:

# play data
# Fix the RNG seed so the random example gives the same table on every run.
set.seed(1960)
year <- "1960"
month <- rep(c(1, 2, 3), each = 30)
day <- rep(1:30, 3)
maxT <- round(runif(90, 20, 22), 1)
minT <- round(runif(90, 10, 12), 1)

df <- data.frame(year, month, day, maxT, minT)

# target and tricky data: rows 1-3 and 10-11 are hot on both variables,
# rows 4 and 12-13 are raised on only one of the two.
df[1:3, "maxT"] <- 30
df[1:4, "minT"] <- 14
df[10:13, "maxT"] <- 30
df[10:11, "minT"] <- 14

# limits: distance of each value from the 90th percentile of its column
df$maxTope <- df$maxT - quantile(df$maxT, 0.9)
df$minTope <- df$minT - quantile(df$minT, 0.9)

# define heat day: 1 when both values are strictly above their percentile
df$heat <- as.numeric(df$maxTope > 0 & df$minTope > 0)

# count heat days: running day number inside each run of heat days, 0 on
# non-heat days. sequence() restarts at 1 for every run found by rle(), and
# multiplying by the 0/1 heat flag zeroes the runs of non-heat days. This
# replaces the row-by-row loop, which re-assigned count[1] on every pass and
# broke on a one-row data frame (2:1 counts backwards).
df$count <- sequence(rle(df$heat)$lengths) * df$heat

# select the third and every later day of each heat wave
# ($count is the day's position within its wave)
df[which(df$count >= 3), ]

Here's a quick little solution:

# A day is "high temp" when MAX and MIN each reach the 90th percentile of
# their own column. (The MAX comparison was missing, so the quantile value
# itself was being used as a logical.)
is_High_Temp <- Mydata$MAX >= quantile(Mydata$MAX, 0.9) &
  Mydata$MIN >= quantile(Mydata$MIN, 0.9)

# A new series starts on the first row and wherever the flag differs from
# the previous row's flag.
start_of_a_series <- c(
  TRUE,
  is_High_Temp[-1] != is_High_Temp[-length(is_High_Temp)]
)

# Number the series 1, 2, 3, ... and give every row the size of its series.
series_number <- cumsum(start_of_a_series)
# FUN must be the function itself; length() with no argument is an error.
series_length <- ave(series_number, series_number, FUN = length)

# Heat wave: a high-temp day inside a series of at least 3 such days.
is_heat_wave <- series_length >= 3 & is_High_Temp
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!