问题
I have a R tidy dataset my_durations
where each case in the data frame corresponds to a sample taken over a duration of time like so:
> glimpse(my_durations)
Observations: 300
Variables: 5
$ sample_id <int> 2, 8, 25, 41, 59, 70, 98, 100, 105, 106, 108, 114, 119, 126,...
$ site_id <int> 2, 13, 12, 23, 47, 23, 66, 72, 72, 50, 50, 54, 45, 73, 48, 7...
$ start_date <dttm> 2015-04-12, 2015-06-10, 2015-07-02, 2015-07-22, 2015-07-29,...
$ end_date <dttm> 2015-05-14, 2015-06-18, 2015-07-08, 2015-07-24, 2015-07-30,...
$ duration <time> 32 days, 8 days, 6 days, 2 days, 1 days, 4 days, 12 days, 2...
Where sample_id
is the unique ID for that sample, site_id
is just an ID for keeping track of where the sample was taken, start_date
and end_date
are when the sampling began and ended, and duration
is simply the difference in time between start_date
and end_date
.
Here is the full dput()
for the dataset:
structure(list(sample_id = c(2L, 8L, 25L, 41L, 59L, 70L, 98L,
100L, 105L, 106L, 108L, 114L, 119L, 126L, 128L, 146L, 151L, 164L,
167L, 169L, 175L, 190L, 198L, 200L, 222L, 237L, 254L, 273L, 276L,
280L, 281L, 290L, 300L, 305L, 314L, 345L, 354L, 371L, 376L, 379L,
380L, 382L, 383L, 389L, 401L, 410L, 413L, 424L, 439L, 466L, 469L,
476L, 482L, 484L, 499L, 505L, 517L, 538L, 580L, 582L, 583L, 584L,
635L, 650L, 655L, 658L, 662L, 671L, 674L, 702L, 709L, 710L, 712L,
715L, 716L, 724L, 734L, 735L, 738L, 785L, 789L, 793L, 794L, 803L,
833L, 856L, 859L, 865L, 866L, 888L, 895L, 898L, 900L, 907L, 938L,
979L, 980L, 988L, 991L, 1009L, 1026L, 1031L, 1034L, 1050L, 1058L,
1061L, 1063L, 1066L, 1069L, 1077L, 1081L, 1091L, 1092L, 1099L,
1100L, 1108L, 1115L, 1119L, 1143L, 1149L, 1158L, 1180L, 1190L,
1195L, 1198L, 1207L, 1231L, 1234L, 1236L, 1242L, 1249L, 1250L,
1271L, 1288L, 1294L, 1311L, 1312L, 1313L, 1319L, 1337L, 1341L,
1345L, 1349L, 1360L, 1374L, 1379L, 1389L, 1393L, 1396L, 1401L,
1404L, 1407L, 1422L, 1434L, 1438L, 1448L, 1454L, 1463L, 1473L,
1489L, 1508L, 1514L, 1518L, 1531L, 1551L, 1564L, 1565L, 1571L,
1572L, 1597L, 1602L, 1619L, 1624L, 1629L, 1630L, 1659L, 1661L,
1666L, 1669L, 1672L, 1678L, 1690L, 1697L, 1700L, 1703L, 1707L,
1715L, 1719L, 1725L, 1732L, 1737L, 1739L, 1754L, 1771L, 1788L,
1790L, 1796L, 1799L, 1802L, 1805L, 1813L, 1814L, 1832L, 1839L,
1844L, 1848L, 1873L, 1887L, 1893L, 1900L, 1901L, 1917L, 1920L,
1939L, 1948L, 1954L, 1956L, 1968L, 1971L, 1975L, 1979L, 2008L,
2015L, 2019L, 2021L, 2028L, 2035L, 2048L, 2062L, 2071L, 2072L,
2075L, 2085L, 2090L, 2091L, 2100L, 2106L, 2115L, 2172L, 2178L,
2181L, 2221L, 2225L, 2228L, 2230L, 2231L, 2237L, 2241L, 2265L,
2266L, 2271L, 2282L, 2284L, 2311L, 2319L, 2337L, 2377L, 2405L,
2409L, 2412L, 2429L, 2434L, 2460L, 2483L, 2485L, 2488L, 2490L,
2500L, 2513L, 2520L, 2521L, 2527L, 2539L, 2555L, 2569L, 2599L,
2605L, 2610L, 2635L, 2640L, 2641L, 2656L, 2667L, 2689L, 2705L,
2720L, 2747L, 2753L, 2756L, 2761L, 2769L, 2772L, 2809L, 2816L,
2818L, 2821L, 2823L, 2828L, 2837L, 2838L), site_id = c(2L, 13L,
12L, 23L, 47L, 23L, 66L, 72L, 72L, 50L, 50L, 54L, 45L, 73L, 48L,
73L, 84L, 85L, 85L, 52L, 66L, 73L, 76L, 95L, 61L, 73L, 106L,
72L, 108L, 90L, 91L, 44L, 103L, 90L, 108L, 105L, 122L, 131L,
133L, 133L, 133L, 133L, 133L, 52L, 138L, 136L, 113L, 146L, 55L,
147L, 113L, 151L, 147L, 117L, 74L, 160L, 55L, 73L, 74L, 73L,
151L, 73L, 169L, 168L, 73L, 73L, 44L, 73L, 182L, 74L, 73L, 105L,
160L, 184L, 184L, 74L, 74L, 73L, 113L, 199L, 73L, 202L, 198L,
73L, 199L, 74L, 73L, 74L, 74L, 198L, 213L, 212L, 213L, 44L, 160L,
221L, 218L, 230L, 226L, 201L, 74L, 73L, 230L, 184L, 161L, 74L,
74L, 73L, 214L, 74L, 73L, 73L, 74L, 73L, 74L, 73L, 74L, 74L,
226L, 73L, 74L, 74L, 201L, 201L, 73L, 74L, 242L, 226L, 74L, 74L,
113L, 73L, 249L, 73L, 249L, 74L, 247L, 240L, 73L, 74L, 44L, 73L,
201L, 74L, 74L, 191L, 73L, 254L, 201L, 248L, 237L, 260L, 73L,
226L, 74L, 191L, 226L, 259L, 73L, 226L, 74L, 237L, 74L, 248L,
275L, 276L, 276L, 277L, 277L, 260L, 280L, 280L, 160L, 244L, 262L,
74L, 44L, 74L, 44L, 73L, 74L, 73L, 73L, 74L, 74L, 73L, 74L, 73L,
244L, 74L, 73L, 105L, 74L, 74L, 294L, 73L, 223L, 223L, 248L,
295L, 73L, 74L, 74L, 295L, 73L, 269L, 73L, 201L, 199L, 74L, 74L,
74L, 271L, 292L, 105L, 292L, 199L, 267L, 292L, 305L, 74L, 74L,
295L, 309L, 74L, 310L, 310L, 271L, 316L, 74L, 73L, 305L, 73L,
113L, 74L, 74L, 73L, 191L, 74L, 245L, 226L, 321L, 241L, 320L,
113L, 323L, 73L, 320L, 73L, 74L, 74L, 73L, 73L, 191L, 74L, 73L,
74L, 74L, 74L, 73L, 245L, 113L, 73L, 16L, 73L, 348L, 350L, 245L,
306L, 191L, 245L, 350L, 244L, 348L, 113L, 191L, 306L, 73L, 73L,
306L, 350L, 73L, 361L, 245L, 73L, 114L, 191L, 73L, 357L, 361L,
376L, 364L, 360L, 378L, 357L, 73L, 380L, 73L, 350L, 364L), start_date = structure(c(1428796800,
1433894400, 1435795200, 1437523200, 1438128000, 1438300800, 1437609600,
1438905600, 1438905600, 1438041600, 1438041600, 1438646400, 1438560000,
1439424000, 1438819200, 1440115200, 1439856000, 1440115200, 1440115200,
1438041600, 1440460800, 1441497600, 1440547200, 1441238400, 1438992000,
1442707200, 1443225600, 1439337600, 1440633600, 1442707200, 1442707200,
1444089600, 1442534400, 1444348800, 1443225600, 1444694400, 1445817600,
1445472000, 1446854400, 1446854400, 1446854400, 1446854400, 1446854400,
1441584000, 1447459200, 1447372800, 1444348800, 1447977600, 1448064000,
1448064000, 1446940800, 1449014400, 1448064000, 1445904000, 1449878400,
1449792000, 1449878400, 1451001600, 1452729600, 1452902400, 1452470400,
1452988800, 1453075200, 1454889600, 1455235200, 1455408000, 1454976000,
1455753600, 1453766400, 1456963200, 1457308800, 1456876800, 1456876800,
1456790400, 1457395200, 1457827200, 1458086400, 1458172800, 1455580800,
1460073600, 1460419200, 1456617600, 1460073600, 1460851200, 1460073600,
1462233600, 1462320000, 1462492800, 1462665600, 1460073600, 1462579200,
1462492800, 1462579200, 1462838400, 1463443200, 1463702400, 1463616000,
1464912000, 1464825600, 1465171200, 1466035200, 1466121600, 1464912000,
1461888000, 1464652800, 1466467200, 1466553600, 1466640000, 1462579200,
1466726400, 1466985600, 1467331200, 1467331200, 1467590400, 1467590400,
1467936000, 1468108800, 1468281600, 1468368000, 1469145600, 1469404800,
1470009600, 1470009600, 1470009600, 1470441600, 1470787200, 1471219200,
1470096000, 1470268800, 1470873600, 1467590400, 1471564800, 1471478400,
1472256000, 1471478400, 1472515200, 1471219200, 1472256000, 1472601600,
1472860800, 1472688000, 1473292800, 1472947200, 1473638400, 1474243200,
1474156800, 1475193600, 1475193600, 1474761600, 1475193600, 1471046400,
1475193600, 1476316800, 1473724800, 1476748800, 1476403200, 1476748800,
1477785600, 1478044800, 1477958400, 1479168000, 1478304000, 1479254400,
1473811200, 1477699200, 1476576000, 1476576000, 1477872000, 1478476800,
1475193600, 1477094400, 1477094400, 1479859200, 1479340800, 1475884800,
1480896000, 1480464000, 1480982400, 1480982400, 1481241600, 1481846400,
1482192000, 1482451200, 1482537600, 1482624000, 1482969600, 1483228800,
1483401600, 1481846400, 1483747200, 1483920000, 1483488000, 1484438400,
1484956800, 1485216000, 1485388800, 1473292800, 1478995200, 1485216000,
1485216000, 1485907200, 1485907200, 1486339200, 1485216000, 1486512000,
1485216000, 1487116800, 1487030400, 1485388800, 1487721600, 1487808000,
1488153600, 1487289600, 1485129600, 1488240000, 1485129600, 1485388800,
1480896000, 1485129600, 1488412800, 1489104000, 1490054400, 1485216000,
1490054400, 1490400000, 1490572800, 1490659200, 1489622400, 1489881600,
1491436800, 1491523200, 1488412800, 1491782400, 1488758400, 1491868800,
1492473600, 1492646400, 1492387200, 1494633600, 1494288000, 1494288000,
1495152000, 1494201600, 1494979200, 1491868800, 1495065600, 1496102400,
1494979200, 1497052800, 1497052800, 1497225600, 1497657600, 1497744000,
1498435200, 1499212800, 1499904000, 1501372800, 1502409600, 1502582400,
1502668800, 1502236800, 1501718400, 1504569600, 1502841600, 1505174400,
1503878400, 1503964800, 1505260800, 1503964800, 1505606400, 1505865600,
1503964800, 1504656000, 1503878400, 1505520000, 1508716800, 1503964800,
1509580800, 1510704000, 1503964800, 1503964800, 1511481600, 1508889600,
1512518400, 1513987200, 1513555200, 1514764800, 1516665600, 1515456000,
1508889600, 1517097600, 1511654400, 1510012800, 1518393600, 1515456000,
1519257600, 1518825600, 1519344000, 1503964800, 1511654400), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), end_date = structure(c(1431561600,
1434585600, 1436313600, 1437696000, 1438214400, 1438646400, 1438646400,
1439078400, 1439078400, 1438128000, 1438128000, 1439164800, 1438905600,
1439942400, 1438992000, 1440288000, 1440460800, 1440720000, 1440720000,
1438128000, 1440547200, 1441584000, 1441238400, 1441670400, 1439769600,
1443052800, 1444003200, 1439769600, 1441324800, 1444348800, 1444348800,
1444694400, 1444521600, 1444867200, 1445126400, 1445212800, 1446336000,
1445558400, 1447113600, 1447286400, 1447372800, 1447545600, 1447632000,
1442707200, 1447545600, 1448236800, 1444608000, 1448236800, 1449014400,
1449273600, 1449273600, 1449532800, 1449446400, 1445990400, 1450051200,
1450137600, 1450396800, 1451174400, 1452902400, 1452988800, 1453075200,
1453075200, 1454716800, 1455148800, 1455321600, 1455494400, 1455580800,
1455840000, 1454976000, 1457049600, 1457395200, 1457395200, 1457395200,
1457395200, 1457481600, 1457913600, 1458172800, 1458345600, 1458518400,
1460419200, 1460592000, 1457222400, 1460592000, 1460937600, 1461801600,
1462320000, 1462406400, 1462665600, 1462752000, 1462924800, 1462924800,
1463097600, 1463270400, 1463443200, 1464134400, 1464912000, 1464912000,
1465084800, 1465257600, 1465776000, 1466121600, 1466208000, 1466208000,
1463702400, 1466553600, 1466553600, 1466640000, 1466726400, 1466726400,
1466812800, 1467072000, 1467417600, 1467417600, 1467676800, 1467676800,
1468022400, 1468195200, 1468368000, 1468972800, 1469232000, 1469491200,
1470096000, 1470614400, 1470614400, 1470528000, 1470873600, 1471305600,
1471305600, 1470355200, 1471219200, 1471564800, 1471651200, 1471910400,
1472342400, 1472083200, 1472601600, 1472601600, 1472601600, 1472688000,
1473120000, 1473206400, 1473379200, 1473638400, 1473811200, 1474502400,
1474761600, 1475280000, 1475366400, 1475452800, 1475712000, 1472515200,
1475884800, 1476403200, 1475625600, 1476835200, 1477180800, 1477440000,
1477872000, 1478131200, 1478649600, 1479254400, 1479513600, 1479600000,
1476316800, 1478476800, 1477526400, 1477699200, 1478304000, 1478649600,
1476576000, 1477699200, 1480204800, 1480464000, 1480636800, 1476403200,
1480982400, 1480982400, 1481241600, 1481587200, 1481673600, 1481932800,
1482278400, 1482537600, 1482624000, 1482796800, 1483056000, 1483315200,
1483488000, 1483660800, 1483833600, 1484006400, 1484006400, 1484524800,
1485043200, 1485388800, 1485475200, 1474761600, 1480809600, 1485734400,
1485734400, 1485993600, 1485993600, 1486425600, 1486425600, 1486598400,
1486684800, 1487203200, 1487462400, 1487635200, 1487808000, 1487894400,
1488240000, 1488240000, 1488672000, 1488844800, 1488931200, 1488931200,
1485648000, 1489449600, 1489449600, 1489536000, 1490140800, 1490313600,
1490400000, 1490572800, 1490659200, 1490745600, 1491004800, 1489968000,
1491523200, 1491609600, 1491609600, 1491868800, 1491868800, 1492128000,
1492560000, 1492732800, 1492992000, 1494720000, 1494892800, 1494979200,
1495670400, 1495843200, 1495670400, 1495929600, 1495324800, 1496188800,
1496188800, 1497139200, 1497139200, 1497312000, 1497744000, 1497830400,
1498953600, 1499299200, 1499990400, 1501459200, 1502496000, 1502668800,
1502755200, 1503446400, 1503446400, 1504656000, 1503100800, 1505260800,
1505260800, 1505260800, 1505865600, 1506211200, 1506297600, 1506470400,
1506470400, 1507075200, 1507248000, 1508025600, 1509235200, 1509321600,
1509753600, 1510790400, 1510790400, 1510790400, 1511568000, 1511913600,
1513123200, 1514160000, 1514764800, 1516320000, 1516752000, 1516838400,
1516838400, 1517184000, 1517270400, 1518480000, 1518998400, 1519171200,
1519344000, 1519344000, 1519430400, 1519689600, 1519689600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), duration = structure(c(32, 8, 6, 2,
1, 4, 12, 2, 2, 1, 1, 6, 4, 6, 2, 2, 7, 7, 7, 1, 1, 1, 8, 5,
9, 4, 9, 5, 8, 19, 19, 7, 23, 6, 22, 6, 6, 1, 3, 5, 6, 8, 9,
13, 1, 10, 3, 3, 11, 14, 27, 6, 16, 1, 2, 4, 6, 2, 2, 1, 7, 1,
19, 3, 1, 1, 7, 1, 14, 1, 1, 6, 6, 7, 1, 1, 1, 2, 34, 4, 2, 7,
6, 1, 20, 1, 1, 2, 1, 33, 4, 7, 8, 7, 8, 14, 15, 2, 5, 7, 1,
1, 15, 21, 22, 1, 1, 1, 48, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1,
1, 1, 7, 7, 1, 1, 1, 14, 1, 4, 46, 1, 5, 1, 7, 1, 16, 4, 1, 3,
6, 1, 8, 2, 3, 7, 1, 2, 8, 6, 17, 8, 1, 22, 1, 9, 8, 1, 1, 8,
1, 14, 4, 29, 9, 11, 13, 5, 2, 16, 7, 36, 7, 15, 6, 1, 6, 3,
7, 5, 1, 1, 1, 1, 2, 1, 1, 1, 21, 1, 1, 6, 1, 1, 2, 1, 17, 21,
6, 6, 1, 1, 1, 14, 1, 17, 1, 5, 26, 1, 1, 1, 11, 41, 7, 44, 41,
55, 50, 12, 5, 1, 59, 4, 2, 1, 1, 16, 1, 1, 1, 37, 1, 36, 3,
1, 1, 7, 1, 7, 8, 6, 19, 8, 47, 3, 1, 14, 1, 1, 1, 1, 1, 6, 1,
1, 1, 1, 1, 1, 14, 20, 1, 3, 1, 16, 15, 7, 26, 8, 7, 29, 28,
39, 29, 6, 62, 2, 1, 79, 79, 1, 35, 7, 2, 14, 18, 1, 16, 92,
1, 65, 98, 7, 43, 1, 6, 1, 182, 93), class = "difftime", units = "days")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -300L))
The challenge I am facing now is that for any duration
that's more than n
days (let's go with n == 7
for now), I want to "split" that duration into n
-day long segments. For example, for sample_id == 2
, the duration
is 32 days, so I want to split that into four 7-day segments, plus one 4-day segment. In the end, the original row for sample_id == 2
would turn into five rows each with start_date
, end_date
, and duration
that correspond to each of the five segments. I would like to have a new column called segment_id
to identify each of the newly-created segments while keeping all of the original columns. BTW, for the sample_id
s that have original duration
s shorter than n
, I want to keep them as is, and they would get segment_id == 1
.
I am stumped. Is there a fairly "tidy" way to achieve this? Thank you very much in advance.
回答1:
We can create columns of lists and unnest
them:
library(tidyverse)
library(lubridate)
df %>% group_by(sample_id,site_id) %>%
mutate(duration_new = (as.numeric(duration)-1) %>% seq(0,.,by=7) %>% c(duration) %>% diff %>% list,
start_date_new = list(start_date + days(c(0,cumsum(head(duration_new[[1]],-1))))),
end_date_new = list(start_date + days(cumsum(duration_new[[1]]))),
segment_id = list(seq_along(duration_new[[1]]))) %>%
unnest %>%
ungroup
# # A tibble: 619 x 9
# sample_id site_id start_date end_date duration duration_new start_date_new end_date_new segment_id
# <int> <int> <dttm> <dttm> <time> <dbl> <dttm> <dttm> <int>
# 1 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 7 2015-04-12 00:00:00 2015-04-19 00:00:00 1
# 2 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 7 2015-04-19 00:00:00 2015-04-26 00:00:00 2
# 3 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 7 2015-04-26 00:00:00 2015-05-03 00:00:00 3
# 4 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 7 2015-05-03 00:00:00 2015-05-10 00:00:00 4
# 5 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 4 2015-05-10 00:00:00 2015-05-14 00:00:00 5
# 6 8 13 2015-06-10 00:00:00 2015-06-18 00:00:00 8 7 2015-06-10 00:00:00 2015-06-17 00:00:00 1
# 7 8 13 2015-06-10 00:00:00 2015-06-18 00:00:00 8 1 2015-06-17 00:00:00 2015-06-18 00:00:00 2
# 8 25 12 2015-07-02 00:00:00 2015-07-08 00:00:00 6 6 2015-07-02 00:00:00 2015-07-08 00:00:00 1
# 9 41 23 2015-07-22 00:00:00 2015-07-24 00:00:00 2 2 2015-07-22 00:00:00 2015-07-24 00:00:00 1
# 10 59 47 2015-07-29 00:00:00 2015-07-30 00:00:00 1 1 2015-07-29 00:00:00 2015-07-30 00:00:00 1
# # ... with 609 more rows
How the unnesting works
After the mutate
call, the data.frame
has just the same amount of rows as before, but it has new columns. These new columns all contain list elements (elements that are lists), we call them list columns.
Then unnest
develop these lists vertically and we get our additional rows, elements from columns that were not list columns are just repeated.
from ?unnest
: If you have a list-column, this makes each element of the list its own row.
how duration_new
is built
We a simple sequence of breaks using seq
(see ?seq
), seq doesn't give the last element however (which is the complete duration), so we add it with c
. The diff
of breaks gives the individual durations.
You can execute this step by step (select until before a pipe, ctrl+enter):
duration <- 32
(as.numeric(duration)-1) %>% seq(0,.,by=7) %>% c(duration) %>% diff %>% list
how start_date_new
and end_date_new
are built
We start from start_date and add the cumulated durations, for start_date_new
we add 0
to the first element, so we keep them offset.
回答2:
I apologize for not using dplyr
and tidyr
but I am much more fluent in data.table
syntax:
n <- 7L
library(data.table)
setDT(my_durations)[, {
tmp <- unique(c(seq(as.Date(start_date), as.Date(end_date), by = paste(n, "days")),
as.Date(end_date)))
.(segment_id = head(seq_along(tmp), -1L),
segment_start = head(tmp, -1L),
segment_end = tail(tmp, -1L),
segment_duration = diff(tmp))
}, by = .(sample_id, site_id, start_date, end_date, duration)]
sample_id site_id start_date end_date duration segment_id segment_start segment_end segment_duration 1: 2 2 2015-04-12 2015-05-14 32 days 1 2015-04-12 2015-04-19 7 days 2: 2 2 2015-04-12 2015-05-14 32 days 2 2015-04-19 2015-04-26 7 days 3: 2 2 2015-04-12 2015-05-14 32 days 3 2015-04-26 2015-05-03 7 days 4: 2 2 2015-04-12 2015-05-14 32 days 4 2015-05-03 2015-05-10 7 days 5: 2 2 2015-04-12 2015-05-14 32 days 5 2015-05-10 2015-05-14 4 days --- 615: 2838 364 2017-11-26 2018-02-27 93 days 10 2018-01-28 2018-02-04 7 days 616: 2838 364 2017-11-26 2018-02-27 93 days 11 2018-02-04 2018-02-11 7 days 617: 2838 364 2017-11-26 2018-02-27 93 days 12 2018-02-11 2018-02-18 7 days 618: 2838 364 2017-11-26 2018-02-27 93 days 13 2018-02-18 2018-02-25 7 days 619: 2838 364 2017-11-26 2018-02-27 93 days 14 2018-02-25 2018-02-27 2 days
The OP has requested that segments whose duration is shorter than n
should be kept as is with segment_id == 1
. This can be verified for the given dataset with n <- 50L
where above code returns
sample_id site_id start_date end_date duration segment_id segment_start segment_end segment_duration 1: 2 2 2015-04-12 2015-05-14 32 days 1 2015-04-12 2015-05-14 32 days 2: 8 13 2015-06-10 2015-06-18 8 days 1 2015-06-10 2015-06-18 8 days 3: 25 12 2015-07-02 2015-07-08 6 days 1 2015-07-02 2015-07-08 6 days 4: 41 23 2015-07-22 2015-07-24 2 days 1 2015-07-22 2015-07-24 2 days 5: 59 47 2015-07-29 2015-07-30 1 days 1 2015-07-29 2015-07-30 1 days --- 308: 2837 350 2017-08-29 2018-02-27 182 days 2 2017-10-18 2017-12-07 50 days 309: 2837 350 2017-08-29 2018-02-27 182 days 3 2017-12-07 2018-01-26 50 days 310: 2837 350 2017-08-29 2018-02-27 182 days 4 2018-01-26 2018-02-27 32 days 311: 2838 364 2017-11-26 2018-02-27 93 days 1 2017-11-26 2018-01-15 50 days 312: 2838 364 2017-11-26 2018-02-27 93 days 2 2018-01-15 2018-02-27 43 days
回答3:
A potential solution. Notice that I converted the duration
column to integer for the operation. You can convert it back if you want.
library(tidyverse)
library(lubridate)
# Define the number of days for one duration
num <- 7L
dat2 <- dat %>%
mutate(duration = as.integer(duration)) %>%
# Count number of duration and how many days left
mutate(times = duration %/% num, left = duration %% num) %>%
# Repeat each row based on number of duration
slice(rep(row_number(), times = times + 1)) %>%
group_by(sample_id, site_id) %>%
# Process the start and end date
mutate(seg_start_date = if_else(row_number() < n(),
start_date + days(num * (n() - 1)),
end_date - days(left)),
seg_end_date = if_else(row_number() < n(),
seg_start_date + days(num - 1),
end_date)) %>%
# Create segment id
mutate(segment_id = 1:n()) %>%
# Create segment duration
mutate(seg_duration = ifelse(row_number() == n(), left, num)) %>%
ungroup() %>%
select(-times, -left)
head(dat2) %>% as.data.frame()
# sample_id site_id start_date end_date duration seg_start_date seg_end_date segment_id seg_duration
# 1 2 2 2015-04-12 2015-05-14 32 2015-04-12 2015-04-18 1 7
# 2 2 2 2015-04-12 2015-05-14 32 2015-04-19 2015-04-25 2 7
# 3 2 2 2015-04-12 2015-05-14 32 2015-04-26 2015-05-02 3 7
# 4 2 2 2015-04-12 2015-05-14 32 2015-05-03 2015-05-09 4 7
# 5 2 2 2015-04-12 2015-05-14 32 2015-05-10 2015-05-14 5 4
# 6 8 13 2015-06-10 2015-06-18 8 2015-06-10 2015-06-16 1 7
来源:https://stackoverflow.com/questions/51407177/r-lubridate-split-durations-into-sub-durations