问题
I have a tidy R dataset, my_durations, where each case (row) in the data frame corresponds to a sample taken over a duration of time, like so:
> glimpse(my_durations)
Observations: 300
Variables: 5
$ sample_id <int> 2, 8, 25, 41, 59, 70, 98, 100, 105, 106, 108, 114, 119, 126,...
$ site_id <int> 2, 13, 12, 23, 47, 23, 66, 72, 72, 50, 50, 54, 45, 73, 48, 7...
$ start_date <dttm> 2015-04-12, 2015-06-10, 2015-07-02, 2015-07-22, 2015-07-29,...
$ end_date <dttm> 2015-05-14, 2015-06-18, 2015-07-08, 2015-07-24, 2015-07-30,...
$ duration <time> 32 days, 8 days, 6 days, 2 days, 1 days, 4 days, 12 days, 2...
Where sample_id is the unique ID for that sample, site_id is just an ID for keeping track of where the sample was taken, start_date and end_date are when the sampling began and ended, and duration is simply the difference in time between start_date and end_date.
Here is the full dput() for the dataset:
structure(list(sample_id = c(2L, 8L, 25L, 41L, 59L, 70L, 98L,
100L, 105L, 106L, 108L, 114L, 119L, 126L, 128L, 146L, 151L, 164L,
167L, 169L, 175L, 190L, 198L, 200L, 222L, 237L, 254L, 273L, 276L,
280L, 281L, 290L, 300L, 305L, 314L, 345L, 354L, 371L, 376L, 379L,
380L, 382L, 383L, 389L, 401L, 410L, 413L, 424L, 439L, 466L, 469L,
476L, 482L, 484L, 499L, 505L, 517L, 538L, 580L, 582L, 583L, 584L,
635L, 650L, 655L, 658L, 662L, 671L, 674L, 702L, 709L, 710L, 712L,
715L, 716L, 724L, 734L, 735L, 738L, 785L, 789L, 793L, 794L, 803L,
833L, 856L, 859L, 865L, 866L, 888L, 895L, 898L, 900L, 907L, 938L,
979L, 980L, 988L, 991L, 1009L, 1026L, 1031L, 1034L, 1050L, 1058L,
1061L, 1063L, 1066L, 1069L, 1077L, 1081L, 1091L, 1092L, 1099L,
1100L, 1108L, 1115L, 1119L, 1143L, 1149L, 1158L, 1180L, 1190L,
1195L, 1198L, 1207L, 1231L, 1234L, 1236L, 1242L, 1249L, 1250L,
1271L, 1288L, 1294L, 1311L, 1312L, 1313L, 1319L, 1337L, 1341L,
1345L, 1349L, 1360L, 1374L, 1379L, 1389L, 1393L, 1396L, 1401L,
1404L, 1407L, 1422L, 1434L, 1438L, 1448L, 1454L, 1463L, 1473L,
1489L, 1508L, 1514L, 1518L, 1531L, 1551L, 1564L, 1565L, 1571L,
1572L, 1597L, 1602L, 1619L, 1624L, 1629L, 1630L, 1659L, 1661L,
1666L, 1669L, 1672L, 1678L, 1690L, 1697L, 1700L, 1703L, 1707L,
1715L, 1719L, 1725L, 1732L, 1737L, 1739L, 1754L, 1771L, 1788L,
1790L, 1796L, 1799L, 1802L, 1805L, 1813L, 1814L, 1832L, 1839L,
1844L, 1848L, 1873L, 1887L, 1893L, 1900L, 1901L, 1917L, 1920L,
1939L, 1948L, 1954L, 1956L, 1968L, 1971L, 1975L, 1979L, 2008L,
2015L, 2019L, 2021L, 2028L, 2035L, 2048L, 2062L, 2071L, 2072L,
2075L, 2085L, 2090L, 2091L, 2100L, 2106L, 2115L, 2172L, 2178L,
2181L, 2221L, 2225L, 2228L, 2230L, 2231L, 2237L, 2241L, 2265L,
2266L, 2271L, 2282L, 2284L, 2311L, 2319L, 2337L, 2377L, 2405L,
2409L, 2412L, 2429L, 2434L, 2460L, 2483L, 2485L, 2488L, 2490L,
2500L, 2513L, 2520L, 2521L, 2527L, 2539L, 2555L, 2569L, 2599L,
2605L, 2610L, 2635L, 2640L, 2641L, 2656L, 2667L, 2689L, 2705L,
2720L, 2747L, 2753L, 2756L, 2761L, 2769L, 2772L, 2809L, 2816L,
2818L, 2821L, 2823L, 2828L, 2837L, 2838L), site_id = c(2L, 13L,
12L, 23L, 47L, 23L, 66L, 72L, 72L, 50L, 50L, 54L, 45L, 73L, 48L,
73L, 84L, 85L, 85L, 52L, 66L, 73L, 76L, 95L, 61L, 73L, 106L,
72L, 108L, 90L, 91L, 44L, 103L, 90L, 108L, 105L, 122L, 131L,
133L, 133L, 133L, 133L, 133L, 52L, 138L, 136L, 113L, 146L, 55L,
147L, 113L, 151L, 147L, 117L, 74L, 160L, 55L, 73L, 74L, 73L,
151L, 73L, 169L, 168L, 73L, 73L, 44L, 73L, 182L, 74L, 73L, 105L,
160L, 184L, 184L, 74L, 74L, 73L, 113L, 199L, 73L, 202L, 198L,
73L, 199L, 74L, 73L, 74L, 74L, 198L, 213L, 212L, 213L, 44L, 160L,
221L, 218L, 230L, 226L, 201L, 74L, 73L, 230L, 184L, 161L, 74L,
74L, 73L, 214L, 74L, 73L, 73L, 74L, 73L, 74L, 73L, 74L, 74L,
226L, 73L, 74L, 74L, 201L, 201L, 73L, 74L, 242L, 226L, 74L, 74L,
113L, 73L, 249L, 73L, 249L, 74L, 247L, 240L, 73L, 74L, 44L, 73L,
201L, 74L, 74L, 191L, 73L, 254L, 201L, 248L, 237L, 260L, 73L,
226L, 74L, 191L, 226L, 259L, 73L, 226L, 74L, 237L, 74L, 248L,
275L, 276L, 276L, 277L, 277L, 260L, 280L, 280L, 160L, 244L, 262L,
74L, 44L, 74L, 44L, 73L, 74L, 73L, 73L, 74L, 74L, 73L, 74L, 73L,
244L, 74L, 73L, 105L, 74L, 74L, 294L, 73L, 223L, 223L, 248L,
295L, 73L, 74L, 74L, 295L, 73L, 269L, 73L, 201L, 199L, 74L, 74L,
74L, 271L, 292L, 105L, 292L, 199L, 267L, 292L, 305L, 74L, 74L,
295L, 309L, 74L, 310L, 310L, 271L, 316L, 74L, 73L, 305L, 73L,
113L, 74L, 74L, 73L, 191L, 74L, 245L, 226L, 321L, 241L, 320L,
113L, 323L, 73L, 320L, 73L, 74L, 74L, 73L, 73L, 191L, 74L, 73L,
74L, 74L, 74L, 73L, 245L, 113L, 73L, 16L, 73L, 348L, 350L, 245L,
306L, 191L, 245L, 350L, 244L, 348L, 113L, 191L, 306L, 73L, 73L,
306L, 350L, 73L, 361L, 245L, 73L, 114L, 191L, 73L, 357L, 361L,
376L, 364L, 360L, 378L, 357L, 73L, 380L, 73L, 350L, 364L), start_date = structure(c(1428796800,
1433894400, 1435795200, 1437523200, 1438128000, 1438300800, 1437609600,
1438905600, 1438905600, 1438041600, 1438041600, 1438646400, 1438560000,
1439424000, 1438819200, 1440115200, 1439856000, 1440115200, 1440115200,
1438041600, 1440460800, 1441497600, 1440547200, 1441238400, 1438992000,
1442707200, 1443225600, 1439337600, 1440633600, 1442707200, 1442707200,
1444089600, 1442534400, 1444348800, 1443225600, 1444694400, 1445817600,
1445472000, 1446854400, 1446854400, 1446854400, 1446854400, 1446854400,
1441584000, 1447459200, 1447372800, 1444348800, 1447977600, 1448064000,
1448064000, 1446940800, 1449014400, 1448064000, 1445904000, 1449878400,
1449792000, 1449878400, 1451001600, 1452729600, 1452902400, 1452470400,
1452988800, 1453075200, 1454889600, 1455235200, 1455408000, 1454976000,
1455753600, 1453766400, 1456963200, 1457308800, 1456876800, 1456876800,
1456790400, 1457395200, 1457827200, 1458086400, 1458172800, 1455580800,
1460073600, 1460419200, 1456617600, 1460073600, 1460851200, 1460073600,
1462233600, 1462320000, 1462492800, 1462665600, 1460073600, 1462579200,
1462492800, 1462579200, 1462838400, 1463443200, 1463702400, 1463616000,
1464912000, 1464825600, 1465171200, 1466035200, 1466121600, 1464912000,
1461888000, 1464652800, 1466467200, 1466553600, 1466640000, 1462579200,
1466726400, 1466985600, 1467331200, 1467331200, 1467590400, 1467590400,
1467936000, 1468108800, 1468281600, 1468368000, 1469145600, 1469404800,
1470009600, 1470009600, 1470009600, 1470441600, 1470787200, 1471219200,
1470096000, 1470268800, 1470873600, 1467590400, 1471564800, 1471478400,
1472256000, 1471478400, 1472515200, 1471219200, 1472256000, 1472601600,
1472860800, 1472688000, 1473292800, 1472947200, 1473638400, 1474243200,
1474156800, 1475193600, 1475193600, 1474761600, 1475193600, 1471046400,
1475193600, 1476316800, 1473724800, 1476748800, 1476403200, 1476748800,
1477785600, 1478044800, 1477958400, 1479168000, 1478304000, 1479254400,
1473811200, 1477699200, 1476576000, 1476576000, 1477872000, 1478476800,
1475193600, 1477094400, 1477094400, 1479859200, 1479340800, 1475884800,
1480896000, 1480464000, 1480982400, 1480982400, 1481241600, 1481846400,
1482192000, 1482451200, 1482537600, 1482624000, 1482969600, 1483228800,
1483401600, 1481846400, 1483747200, 1483920000, 1483488000, 1484438400,
1484956800, 1485216000, 1485388800, 1473292800, 1478995200, 1485216000,
1485216000, 1485907200, 1485907200, 1486339200, 1485216000, 1486512000,
1485216000, 1487116800, 1487030400, 1485388800, 1487721600, 1487808000,
1488153600, 1487289600, 1485129600, 1488240000, 1485129600, 1485388800,
1480896000, 1485129600, 1488412800, 1489104000, 1490054400, 1485216000,
1490054400, 1490400000, 1490572800, 1490659200, 1489622400, 1489881600,
1491436800, 1491523200, 1488412800, 1491782400, 1488758400, 1491868800,
1492473600, 1492646400, 1492387200, 1494633600, 1494288000, 1494288000,
1495152000, 1494201600, 1494979200, 1491868800, 1495065600, 1496102400,
1494979200, 1497052800, 1497052800, 1497225600, 1497657600, 1497744000,
1498435200, 1499212800, 1499904000, 1501372800, 1502409600, 1502582400,
1502668800, 1502236800, 1501718400, 1504569600, 1502841600, 1505174400,
1503878400, 1503964800, 1505260800, 1503964800, 1505606400, 1505865600,
1503964800, 1504656000, 1503878400, 1505520000, 1508716800, 1503964800,
1509580800, 1510704000, 1503964800, 1503964800, 1511481600, 1508889600,
1512518400, 1513987200, 1513555200, 1514764800, 1516665600, 1515456000,
1508889600, 1517097600, 1511654400, 1510012800, 1518393600, 1515456000,
1519257600, 1518825600, 1519344000, 1503964800, 1511654400), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), end_date = structure(c(1431561600,
1434585600, 1436313600, 1437696000, 1438214400, 1438646400, 1438646400,
1439078400, 1439078400, 1438128000, 1438128000, 1439164800, 1438905600,
1439942400, 1438992000, 1440288000, 1440460800, 1440720000, 1440720000,
1438128000, 1440547200, 1441584000, 1441238400, 1441670400, 1439769600,
1443052800, 1444003200, 1439769600, 1441324800, 1444348800, 1444348800,
1444694400, 1444521600, 1444867200, 1445126400, 1445212800, 1446336000,
1445558400, 1447113600, 1447286400, 1447372800, 1447545600, 1447632000,
1442707200, 1447545600, 1448236800, 1444608000, 1448236800, 1449014400,
1449273600, 1449273600, 1449532800, 1449446400, 1445990400, 1450051200,
1450137600, 1450396800, 1451174400, 1452902400, 1452988800, 1453075200,
1453075200, 1454716800, 1455148800, 1455321600, 1455494400, 1455580800,
1455840000, 1454976000, 1457049600, 1457395200, 1457395200, 1457395200,
1457395200, 1457481600, 1457913600, 1458172800, 1458345600, 1458518400,
1460419200, 1460592000, 1457222400, 1460592000, 1460937600, 1461801600,
1462320000, 1462406400, 1462665600, 1462752000, 1462924800, 1462924800,
1463097600, 1463270400, 1463443200, 1464134400, 1464912000, 1464912000,
1465084800, 1465257600, 1465776000, 1466121600, 1466208000, 1466208000,
1463702400, 1466553600, 1466553600, 1466640000, 1466726400, 1466726400,
1466812800, 1467072000, 1467417600, 1467417600, 1467676800, 1467676800,
1468022400, 1468195200, 1468368000, 1468972800, 1469232000, 1469491200,
1470096000, 1470614400, 1470614400, 1470528000, 1470873600, 1471305600,
1471305600, 1470355200, 1471219200, 1471564800, 1471651200, 1471910400,
1472342400, 1472083200, 1472601600, 1472601600, 1472601600, 1472688000,
1473120000, 1473206400, 1473379200, 1473638400, 1473811200, 1474502400,
1474761600, 1475280000, 1475366400, 1475452800, 1475712000, 1472515200,
1475884800, 1476403200, 1475625600, 1476835200, 1477180800, 1477440000,
1477872000, 1478131200, 1478649600, 1479254400, 1479513600, 1479600000,
1476316800, 1478476800, 1477526400, 1477699200, 1478304000, 1478649600,
1476576000, 1477699200, 1480204800, 1480464000, 1480636800, 1476403200,
1480982400, 1480982400, 1481241600, 1481587200, 1481673600, 1481932800,
1482278400, 1482537600, 1482624000, 1482796800, 1483056000, 1483315200,
1483488000, 1483660800, 1483833600, 1484006400, 1484006400, 1484524800,
1485043200, 1485388800, 1485475200, 1474761600, 1480809600, 1485734400,
1485734400, 1485993600, 1485993600, 1486425600, 1486425600, 1486598400,
1486684800, 1487203200, 1487462400, 1487635200, 1487808000, 1487894400,
1488240000, 1488240000, 1488672000, 1488844800, 1488931200, 1488931200,
1485648000, 1489449600, 1489449600, 1489536000, 1490140800, 1490313600,
1490400000, 1490572800, 1490659200, 1490745600, 1491004800, 1489968000,
1491523200, 1491609600, 1491609600, 1491868800, 1491868800, 1492128000,
1492560000, 1492732800, 1492992000, 1494720000, 1494892800, 1494979200,
1495670400, 1495843200, 1495670400, 1495929600, 1495324800, 1496188800,
1496188800, 1497139200, 1497139200, 1497312000, 1497744000, 1497830400,
1498953600, 1499299200, 1499990400, 1501459200, 1502496000, 1502668800,
1502755200, 1503446400, 1503446400, 1504656000, 1503100800, 1505260800,
1505260800, 1505260800, 1505865600, 1506211200, 1506297600, 1506470400,
1506470400, 1507075200, 1507248000, 1508025600, 1509235200, 1509321600,
1509753600, 1510790400, 1510790400, 1510790400, 1511568000, 1511913600,
1513123200, 1514160000, 1514764800, 1516320000, 1516752000, 1516838400,
1516838400, 1517184000, 1517270400, 1518480000, 1518998400, 1519171200,
1519344000, 1519344000, 1519430400, 1519689600, 1519689600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), duration = structure(c(32, 8, 6, 2,
1, 4, 12, 2, 2, 1, 1, 6, 4, 6, 2, 2, 7, 7, 7, 1, 1, 1, 8, 5,
9, 4, 9, 5, 8, 19, 19, 7, 23, 6, 22, 6, 6, 1, 3, 5, 6, 8, 9,
13, 1, 10, 3, 3, 11, 14, 27, 6, 16, 1, 2, 4, 6, 2, 2, 1, 7, 1,
19, 3, 1, 1, 7, 1, 14, 1, 1, 6, 6, 7, 1, 1, 1, 2, 34, 4, 2, 7,
6, 1, 20, 1, 1, 2, 1, 33, 4, 7, 8, 7, 8, 14, 15, 2, 5, 7, 1,
1, 15, 21, 22, 1, 1, 1, 48, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1,
1, 1, 7, 7, 1, 1, 1, 14, 1, 4, 46, 1, 5, 1, 7, 1, 16, 4, 1, 3,
6, 1, 8, 2, 3, 7, 1, 2, 8, 6, 17, 8, 1, 22, 1, 9, 8, 1, 1, 8,
1, 14, 4, 29, 9, 11, 13, 5, 2, 16, 7, 36, 7, 15, 6, 1, 6, 3,
7, 5, 1, 1, 1, 1, 2, 1, 1, 1, 21, 1, 1, 6, 1, 1, 2, 1, 17, 21,
6, 6, 1, 1, 1, 14, 1, 17, 1, 5, 26, 1, 1, 1, 11, 41, 7, 44, 41,
55, 50, 12, 5, 1, 59, 4, 2, 1, 1, 16, 1, 1, 1, 37, 1, 36, 3,
1, 1, 7, 1, 7, 8, 6, 19, 8, 47, 3, 1, 14, 1, 1, 1, 1, 1, 6, 1,
1, 1, 1, 1, 1, 14, 20, 1, 3, 1, 16, 15, 7, 26, 8, 7, 29, 28,
39, 29, 6, 62, 2, 1, 79, 79, 1, 35, 7, 2, 14, 18, 1, 16, 92,
1, 65, 98, 7, 43, 1, 6, 1, 182, 93), class = "difftime", units = "days")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -300L))
The challenge I am facing now is that for any duration that's more than n days (let's go with n == 7 for now), I want to "split" that duration into n-day long segments. For example, for sample_id == 2, the duration is 32 days, so I want to split that into four 7-day segments, plus one 4-day segment. In the end, the original row for sample_id == 2 would turn into five rows each with start_date, end_date, and duration that correspond to each of the five segments. I would like to have a new column called segment_id to identify each of the newly-created segments while keeping all of the original columns. BTW, for the sample_ids that have original durations shorter than n, I want to keep them as is, and they would get segment_id == 1.
I am stumped. Is there a fairly "tidy" way to achieve this? Thank you very much in advance.
回答1:
We can create columns of lists and unnest them:
library(tidyverse)
library(lubridate)
# Split every sample into consecutive segments of at most `n_days` days.
# `df` is expected to hold the question's `my_durations` tibble.
n_days <- 7

df %>%
  # sample_id is unique, so every group is a single row and the `[[1]]`
  # look-ups below always refer to the list built for the current sample.
  group_by(sample_id, site_id) %>%
  mutate(
    # Break points 0, n_days, 2 * n_days, ... below the duration, followed by
    # the full duration; consecutive differences are the segment lengths.
    duration_new = (as.numeric(duration) - 1) %>%
      seq(0, ., by = n_days) %>%
      c(duration) %>%
      diff() %>%
      list(),
    # Each segment starts where the previous one ended: offsets 0, d1,
    # d1 + d2, ... (the last cumulated duration is dropped).
    start_date_new = list(
      start_date + days(c(0, cumsum(head(duration_new[[1]], -1))))
    ),
    end_date_new = list(start_date + days(cumsum(duration_new[[1]]))),
    segment_id = list(seq_along(duration_new[[1]]))
  ) %>%
  # Name the list-columns explicitly: calling unnest() without `cols` is
  # deprecated since tidyr 1.0.0 and emits a warning.
  unnest(cols = c(duration_new, start_date_new, end_date_new, segment_id)) %>%
  ungroup()
# # A tibble: 619 x 9
# sample_id site_id start_date end_date duration duration_new start_date_new end_date_new segment_id
# <int> <int> <dttm> <dttm> <time> <dbl> <dttm> <dttm> <int>
# 1 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 7 2015-04-12 00:00:00 2015-04-19 00:00:00 1
# 2 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 7 2015-04-19 00:00:00 2015-04-26 00:00:00 2
# 3 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 7 2015-04-26 00:00:00 2015-05-03 00:00:00 3
# 4 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 7 2015-05-03 00:00:00 2015-05-10 00:00:00 4
# 5 2 2 2015-04-12 00:00:00 2015-05-14 00:00:00 32 4 2015-05-10 00:00:00 2015-05-14 00:00:00 5
# 6 8 13 2015-06-10 00:00:00 2015-06-18 00:00:00 8 7 2015-06-10 00:00:00 2015-06-17 00:00:00 1
# 7 8 13 2015-06-10 00:00:00 2015-06-18 00:00:00 8 1 2015-06-17 00:00:00 2015-06-18 00:00:00 2
# 8 25 12 2015-07-02 00:00:00 2015-07-08 00:00:00 6 6 2015-07-02 00:00:00 2015-07-08 00:00:00 1
# 9 41 23 2015-07-22 00:00:00 2015-07-24 00:00:00 2 2 2015-07-22 00:00:00 2015-07-24 00:00:00 1
# 10 59 47 2015-07-29 00:00:00 2015-07-30 00:00:00 1 1 2015-07-29 00:00:00 2015-07-30 00:00:00 1
# # ... with 609 more rows
How the unnesting works
After the mutate call, the data frame has the same number of rows as before, but it has new columns. Each of these new columns holds lists as its elements; we call them list columns.
Then unnest expands these lists vertically, which gives us our additional rows; the values in columns that are not list columns are simply repeated.
from ?unnest : If you have a list-column, this makes each element of the list its own row.
how duration_new is built
We build a simple sequence of breaks using seq (see ?seq). seq doesn't give the last element, however (which is the complete duration), so we add it with c. The diff of the breaks gives the individual durations.
You can execute this step by step (select until before a pipe, ctrl+enter):
# Reproduce the `duration_new` computation for a single 32-day sample.
duration <- 32
(as.numeric(duration) - 1) %>%  # 31
  seq(0, ., by = 7) %>%         # 0 7 14 21 28: one break per segment start
  c(duration) %>%               # 0 7 14 21 28 32: append the full duration
  diff() %>%                    # 7 7 7 7 4: individual segment lengths
  list()                        # wrap the vector so it fits one list-column cell
how start_date_new and end_date_new are built
We start from start_date and add the cumulated durations. For start_date_new we prepend 0 and drop the last cumulated duration, so that each segment starts where the previous one ended.
回答2:
I apologize for not using dplyr and tidyr but I am much more fluent in data.table syntax:
# Maximum segment length in days.
n <- 7L
library(data.table)
# setDT() converts my_durations to a data.table by reference. Grouping by all
# five original columns carries them through to the result unchanged.
setDT(my_durations)[
  ,
  {
    first_day <- as.Date(start_date)
    last_day <- as.Date(end_date)
    # Segment boundaries: every n-th day from the start, closed with the end
    # date; unique() drops the duplicate when the end date is itself a boundary.
    breaks <- unique(c(seq(first_day, last_day, by = paste(n, "days")), last_day))
    n_segments <- length(breaks) - 1L
    list(
      segment_id = seq_len(n_segments),
      segment_start = breaks[-length(breaks)],
      segment_end = breaks[-1L],
      segment_duration = diff(breaks)
    )
  },
  by = list(sample_id, site_id, start_date, end_date, duration)
]
     sample_id site_id start_date   end_date duration segment_id segment_start segment_end segment_duration
  1:         2       2 2015-04-12 2015-05-14  32 days          1    2015-04-12  2015-04-19           7 days
  2:         2       2 2015-04-12 2015-05-14  32 days          2    2015-04-19  2015-04-26           7 days
  3:         2       2 2015-04-12 2015-05-14  32 days          3    2015-04-26  2015-05-03           7 days
  4:         2       2 2015-04-12 2015-05-14  32 days          4    2015-05-03  2015-05-10           7 days
  5:         2       2 2015-04-12 2015-05-14  32 days          5    2015-05-10  2015-05-14           4 days
 ---
615:      2838     364 2017-11-26 2018-02-27  93 days         10    2018-01-28  2018-02-04           7 days
616:      2838     364 2017-11-26 2018-02-27  93 days         11    2018-02-04  2018-02-11           7 days
617:      2838     364 2017-11-26 2018-02-27  93 days         12    2018-02-11  2018-02-18           7 days
618:      2838     364 2017-11-26 2018-02-27  93 days         13    2018-02-18  2018-02-25           7 days
619:      2838     364 2017-11-26 2018-02-27  93 days         14    2018-02-25  2018-02-27           2 days
The OP has requested that segments whose duration is shorter than n should be kept as is with segment_id == 1. This can be verified for the given dataset with n <- 50L where above code returns
     sample_id site_id start_date   end_date duration segment_id segment_start segment_end segment_duration
  1:         2       2 2015-04-12 2015-05-14  32 days          1    2015-04-12  2015-05-14          32 days
  2:         8      13 2015-06-10 2015-06-18   8 days          1    2015-06-10  2015-06-18           8 days
  3:        25      12 2015-07-02 2015-07-08   6 days          1    2015-07-02  2015-07-08           6 days
  4:        41      23 2015-07-22 2015-07-24   2 days          1    2015-07-22  2015-07-24           2 days
  5:        59      47 2015-07-29 2015-07-30   1 days          1    2015-07-29  2015-07-30           1 days
 ---
308:      2837     350 2017-08-29 2018-02-27 182 days          2    2017-10-18  2017-12-07          50 days
309:      2837     350 2017-08-29 2018-02-27 182 days          3    2017-12-07  2018-01-26          50 days
310:      2837     350 2017-08-29 2018-02-27 182 days          4    2018-01-26  2018-02-27          32 days
311:      2838     364 2017-11-26 2018-02-27  93 days          1    2017-11-26  2018-01-15          50 days
312:      2838     364 2017-11-26 2018-02-27  93 days          2    2018-01-15  2018-02-27          43 days
回答3:
A potential solution. Notice that I converted the duration column to integer for the operation. You can convert it back if you want.
library(tidyverse)
library(lubridate)
# Define the number of days for one duration
num <- 7L

# Split every sample in `dat` into segments of at most `num` days. Non-final
# segments are reported with an inclusive end date (start + num - 1 days);
# the final segment always ends on the sample's own end_date.
dat2 <- dat %>%
  mutate(duration = as.integer(duration)) %>%
  # Number of segments per sample (`times`, at least one) and length of the
  # final segment (`left`). Rounding up, instead of `duration %/% num + 1`,
  # avoids an extra zero-length segment when the duration is an exact
  # multiple of `num`.
  mutate(
    times = pmax((duration + num - 1L) %/% num, 1L),
    left = duration - num * (times - 1L)
  ) %>%
  # Repeat each row once per segment
  slice(rep(row_number(), times = times)) %>%
  group_by(sample_id, site_id) %>%
  # Process the start and end date
  mutate(
    # Segment k starts (k - 1) * num days after the sample start. The offset
    # must use row_number(), not n(): n() is the group size, which would give
    # every non-final segment the same start date.
    seg_start_date = if_else(
      row_number() < n(),
      start_date + days(num * (row_number() - 1L)),
      end_date - days(left)
    ),
    seg_end_date = if_else(
      row_number() < n(),
      seg_start_date + days(num - 1L),
      end_date
    )
  ) %>%
  # Create segment id
  mutate(segment_id = row_number()) %>%
  # Create segment duration
  mutate(seg_duration = ifelse(row_number() == n(), left, num)) %>%
  ungroup() %>%
  select(-times, -left)
head(dat2) %>% as.data.frame()
# sample_id site_id start_date end_date duration seg_start_date seg_end_date segment_id seg_duration
# 1 2 2 2015-04-12 2015-05-14 32 2015-04-12 2015-04-18 1 7
# 2 2 2 2015-04-12 2015-05-14 32 2015-04-19 2015-04-25 2 7
# 3 2 2 2015-04-12 2015-05-14 32 2015-04-26 2015-05-02 3 7
# 4 2 2 2015-04-12 2015-05-14 32 2015-05-03 2015-05-09 4 7
# 5 2 2 2015-04-12 2015-05-14 32 2015-05-10 2015-05-14 5 4
# 6 8 13 2015-06-10 2015-06-18 8 2015-06-10 2015-06-16 1 7
来源:https://stackoverflow.com/questions/51407177/r-lubridate-split-durations-into-sub-durations