R - lubridate: split durations into “sub-durations”

生来就可爱ヽ(ⅴ<●) 提交于 2019-12-08 05:47:23

问题


I have a R tidy dataset my_durations where each case in the data frame corresponds to a sample taken over a duration of time like so:

> glimpse(my_durations)
Observations: 300
Variables: 5
$ sample_id  <int> 2, 8, 25, 41, 59, 70, 98, 100, 105, 106, 108, 114, 119, 126,...
$ site_id    <int> 2, 13, 12, 23, 47, 23, 66, 72, 72, 50, 50, 54, 45, 73, 48, 7...
$ start_date <dttm> 2015-04-12, 2015-06-10, 2015-07-02, 2015-07-22, 2015-07-29,...
$ end_date   <dttm> 2015-05-14, 2015-06-18, 2015-07-08, 2015-07-24, 2015-07-30,...
$ duration   <time> 32 days, 8 days, 6 days, 2 days, 1 days, 4 days, 12 days, 2...

Where sample_id is the unique ID for that sample, site_id is just an ID for keeping track of where the sample was taken, start_date and end_date are when the sampling began and ended, and duration is simply the difference in time between start_date and end_date.

Here is the full dput() for the dataset:

structure(list(sample_id = c(2L, 8L, 25L, 41L, 59L, 70L, 98L, 
100L, 105L, 106L, 108L, 114L, 119L, 126L, 128L, 146L, 151L, 164L, 
167L, 169L, 175L, 190L, 198L, 200L, 222L, 237L, 254L, 273L, 276L, 
280L, 281L, 290L, 300L, 305L, 314L, 345L, 354L, 371L, 376L, 379L, 
380L, 382L, 383L, 389L, 401L, 410L, 413L, 424L, 439L, 466L, 469L, 
476L, 482L, 484L, 499L, 505L, 517L, 538L, 580L, 582L, 583L, 584L, 
635L, 650L, 655L, 658L, 662L, 671L, 674L, 702L, 709L, 710L, 712L, 
715L, 716L, 724L, 734L, 735L, 738L, 785L, 789L, 793L, 794L, 803L, 
833L, 856L, 859L, 865L, 866L, 888L, 895L, 898L, 900L, 907L, 938L, 
979L, 980L, 988L, 991L, 1009L, 1026L, 1031L, 1034L, 1050L, 1058L, 
1061L, 1063L, 1066L, 1069L, 1077L, 1081L, 1091L, 1092L, 1099L, 
1100L, 1108L, 1115L, 1119L, 1143L, 1149L, 1158L, 1180L, 1190L, 
1195L, 1198L, 1207L, 1231L, 1234L, 1236L, 1242L, 1249L, 1250L, 
1271L, 1288L, 1294L, 1311L, 1312L, 1313L, 1319L, 1337L, 1341L, 
1345L, 1349L, 1360L, 1374L, 1379L, 1389L, 1393L, 1396L, 1401L, 
1404L, 1407L, 1422L, 1434L, 1438L, 1448L, 1454L, 1463L, 1473L, 
1489L, 1508L, 1514L, 1518L, 1531L, 1551L, 1564L, 1565L, 1571L, 
1572L, 1597L, 1602L, 1619L, 1624L, 1629L, 1630L, 1659L, 1661L, 
1666L, 1669L, 1672L, 1678L, 1690L, 1697L, 1700L, 1703L, 1707L, 
1715L, 1719L, 1725L, 1732L, 1737L, 1739L, 1754L, 1771L, 1788L, 
1790L, 1796L, 1799L, 1802L, 1805L, 1813L, 1814L, 1832L, 1839L, 
1844L, 1848L, 1873L, 1887L, 1893L, 1900L, 1901L, 1917L, 1920L, 
1939L, 1948L, 1954L, 1956L, 1968L, 1971L, 1975L, 1979L, 2008L, 
2015L, 2019L, 2021L, 2028L, 2035L, 2048L, 2062L, 2071L, 2072L, 
2075L, 2085L, 2090L, 2091L, 2100L, 2106L, 2115L, 2172L, 2178L, 
2181L, 2221L, 2225L, 2228L, 2230L, 2231L, 2237L, 2241L, 2265L, 
2266L, 2271L, 2282L, 2284L, 2311L, 2319L, 2337L, 2377L, 2405L, 
2409L, 2412L, 2429L, 2434L, 2460L, 2483L, 2485L, 2488L, 2490L, 
2500L, 2513L, 2520L, 2521L, 2527L, 2539L, 2555L, 2569L, 2599L, 
2605L, 2610L, 2635L, 2640L, 2641L, 2656L, 2667L, 2689L, 2705L, 
2720L, 2747L, 2753L, 2756L, 2761L, 2769L, 2772L, 2809L, 2816L, 
2818L, 2821L, 2823L, 2828L, 2837L, 2838L), site_id = c(2L, 13L, 
12L, 23L, 47L, 23L, 66L, 72L, 72L, 50L, 50L, 54L, 45L, 73L, 48L, 
73L, 84L, 85L, 85L, 52L, 66L, 73L, 76L, 95L, 61L, 73L, 106L, 
72L, 108L, 90L, 91L, 44L, 103L, 90L, 108L, 105L, 122L, 131L, 
133L, 133L, 133L, 133L, 133L, 52L, 138L, 136L, 113L, 146L, 55L, 
147L, 113L, 151L, 147L, 117L, 74L, 160L, 55L, 73L, 74L, 73L, 
151L, 73L, 169L, 168L, 73L, 73L, 44L, 73L, 182L, 74L, 73L, 105L, 
160L, 184L, 184L, 74L, 74L, 73L, 113L, 199L, 73L, 202L, 198L, 
73L, 199L, 74L, 73L, 74L, 74L, 198L, 213L, 212L, 213L, 44L, 160L, 
221L, 218L, 230L, 226L, 201L, 74L, 73L, 230L, 184L, 161L, 74L, 
74L, 73L, 214L, 74L, 73L, 73L, 74L, 73L, 74L, 73L, 74L, 74L, 
226L, 73L, 74L, 74L, 201L, 201L, 73L, 74L, 242L, 226L, 74L, 74L, 
113L, 73L, 249L, 73L, 249L, 74L, 247L, 240L, 73L, 74L, 44L, 73L, 
201L, 74L, 74L, 191L, 73L, 254L, 201L, 248L, 237L, 260L, 73L, 
226L, 74L, 191L, 226L, 259L, 73L, 226L, 74L, 237L, 74L, 248L, 
275L, 276L, 276L, 277L, 277L, 260L, 280L, 280L, 160L, 244L, 262L, 
74L, 44L, 74L, 44L, 73L, 74L, 73L, 73L, 74L, 74L, 73L, 74L, 73L, 
244L, 74L, 73L, 105L, 74L, 74L, 294L, 73L, 223L, 223L, 248L, 
295L, 73L, 74L, 74L, 295L, 73L, 269L, 73L, 201L, 199L, 74L, 74L, 
74L, 271L, 292L, 105L, 292L, 199L, 267L, 292L, 305L, 74L, 74L, 
295L, 309L, 74L, 310L, 310L, 271L, 316L, 74L, 73L, 305L, 73L, 
113L, 74L, 74L, 73L, 191L, 74L, 245L, 226L, 321L, 241L, 320L, 
113L, 323L, 73L, 320L, 73L, 74L, 74L, 73L, 73L, 191L, 74L, 73L, 
74L, 74L, 74L, 73L, 245L, 113L, 73L, 16L, 73L, 348L, 350L, 245L, 
306L, 191L, 245L, 350L, 244L, 348L, 113L, 191L, 306L, 73L, 73L, 
306L, 350L, 73L, 361L, 245L, 73L, 114L, 191L, 73L, 357L, 361L, 
376L, 364L, 360L, 378L, 357L, 73L, 380L, 73L, 350L, 364L), start_date = structure(c(1428796800, 
1433894400, 1435795200, 1437523200, 1438128000, 1438300800, 1437609600, 
1438905600, 1438905600, 1438041600, 1438041600, 1438646400, 1438560000, 
1439424000, 1438819200, 1440115200, 1439856000, 1440115200, 1440115200, 
1438041600, 1440460800, 1441497600, 1440547200, 1441238400, 1438992000, 
1442707200, 1443225600, 1439337600, 1440633600, 1442707200, 1442707200, 
1444089600, 1442534400, 1444348800, 1443225600, 1444694400, 1445817600, 
1445472000, 1446854400, 1446854400, 1446854400, 1446854400, 1446854400, 
1441584000, 1447459200, 1447372800, 1444348800, 1447977600, 1448064000, 
1448064000, 1446940800, 1449014400, 1448064000, 1445904000, 1449878400, 
1449792000, 1449878400, 1451001600, 1452729600, 1452902400, 1452470400, 
1452988800, 1453075200, 1454889600, 1455235200, 1455408000, 1454976000, 
1455753600, 1453766400, 1456963200, 1457308800, 1456876800, 1456876800, 
1456790400, 1457395200, 1457827200, 1458086400, 1458172800, 1455580800, 
1460073600, 1460419200, 1456617600, 1460073600, 1460851200, 1460073600, 
1462233600, 1462320000, 1462492800, 1462665600, 1460073600, 1462579200, 
1462492800, 1462579200, 1462838400, 1463443200, 1463702400, 1463616000, 
1464912000, 1464825600, 1465171200, 1466035200, 1466121600, 1464912000, 
1461888000, 1464652800, 1466467200, 1466553600, 1466640000, 1462579200, 
1466726400, 1466985600, 1467331200, 1467331200, 1467590400, 1467590400, 
1467936000, 1468108800, 1468281600, 1468368000, 1469145600, 1469404800, 
1470009600, 1470009600, 1470009600, 1470441600, 1470787200, 1471219200, 
1470096000, 1470268800, 1470873600, 1467590400, 1471564800, 1471478400, 
1472256000, 1471478400, 1472515200, 1471219200, 1472256000, 1472601600, 
1472860800, 1472688000, 1473292800, 1472947200, 1473638400, 1474243200, 
1474156800, 1475193600, 1475193600, 1474761600, 1475193600, 1471046400, 
1475193600, 1476316800, 1473724800, 1476748800, 1476403200, 1476748800, 
1477785600, 1478044800, 1477958400, 1479168000, 1478304000, 1479254400, 
1473811200, 1477699200, 1476576000, 1476576000, 1477872000, 1478476800, 
1475193600, 1477094400, 1477094400, 1479859200, 1479340800, 1475884800, 
1480896000, 1480464000, 1480982400, 1480982400, 1481241600, 1481846400, 
1482192000, 1482451200, 1482537600, 1482624000, 1482969600, 1483228800, 
1483401600, 1481846400, 1483747200, 1483920000, 1483488000, 1484438400, 
1484956800, 1485216000, 1485388800, 1473292800, 1478995200, 1485216000, 
1485216000, 1485907200, 1485907200, 1486339200, 1485216000, 1486512000, 
1485216000, 1487116800, 1487030400, 1485388800, 1487721600, 1487808000, 
1488153600, 1487289600, 1485129600, 1488240000, 1485129600, 1485388800, 
1480896000, 1485129600, 1488412800, 1489104000, 1490054400, 1485216000, 
1490054400, 1490400000, 1490572800, 1490659200, 1489622400, 1489881600, 
1491436800, 1491523200, 1488412800, 1491782400, 1488758400, 1491868800, 
1492473600, 1492646400, 1492387200, 1494633600, 1494288000, 1494288000, 
1495152000, 1494201600, 1494979200, 1491868800, 1495065600, 1496102400, 
1494979200, 1497052800, 1497052800, 1497225600, 1497657600, 1497744000, 
1498435200, 1499212800, 1499904000, 1501372800, 1502409600, 1502582400, 
1502668800, 1502236800, 1501718400, 1504569600, 1502841600, 1505174400, 
1503878400, 1503964800, 1505260800, 1503964800, 1505606400, 1505865600, 
1503964800, 1504656000, 1503878400, 1505520000, 1508716800, 1503964800, 
1509580800, 1510704000, 1503964800, 1503964800, 1511481600, 1508889600, 
1512518400, 1513987200, 1513555200, 1514764800, 1516665600, 1515456000, 
1508889600, 1517097600, 1511654400, 1510012800, 1518393600, 1515456000, 
1519257600, 1518825600, 1519344000, 1503964800, 1511654400), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), end_date = structure(c(1431561600, 
1434585600, 1436313600, 1437696000, 1438214400, 1438646400, 1438646400, 
1439078400, 1439078400, 1438128000, 1438128000, 1439164800, 1438905600, 
1439942400, 1438992000, 1440288000, 1440460800, 1440720000, 1440720000, 
1438128000, 1440547200, 1441584000, 1441238400, 1441670400, 1439769600, 
1443052800, 1444003200, 1439769600, 1441324800, 1444348800, 1444348800, 
1444694400, 1444521600, 1444867200, 1445126400, 1445212800, 1446336000, 
1445558400, 1447113600, 1447286400, 1447372800, 1447545600, 1447632000, 
1442707200, 1447545600, 1448236800, 1444608000, 1448236800, 1449014400, 
1449273600, 1449273600, 1449532800, 1449446400, 1445990400, 1450051200, 
1450137600, 1450396800, 1451174400, 1452902400, 1452988800, 1453075200, 
1453075200, 1454716800, 1455148800, 1455321600, 1455494400, 1455580800, 
1455840000, 1454976000, 1457049600, 1457395200, 1457395200, 1457395200, 
1457395200, 1457481600, 1457913600, 1458172800, 1458345600, 1458518400, 
1460419200, 1460592000, 1457222400, 1460592000, 1460937600, 1461801600, 
1462320000, 1462406400, 1462665600, 1462752000, 1462924800, 1462924800, 
1463097600, 1463270400, 1463443200, 1464134400, 1464912000, 1464912000, 
1465084800, 1465257600, 1465776000, 1466121600, 1466208000, 1466208000, 
1463702400, 1466553600, 1466553600, 1466640000, 1466726400, 1466726400, 
1466812800, 1467072000, 1467417600, 1467417600, 1467676800, 1467676800, 
1468022400, 1468195200, 1468368000, 1468972800, 1469232000, 1469491200, 
1470096000, 1470614400, 1470614400, 1470528000, 1470873600, 1471305600, 
1471305600, 1470355200, 1471219200, 1471564800, 1471651200, 1471910400, 
1472342400, 1472083200, 1472601600, 1472601600, 1472601600, 1472688000, 
1473120000, 1473206400, 1473379200, 1473638400, 1473811200, 1474502400, 
1474761600, 1475280000, 1475366400, 1475452800, 1475712000, 1472515200, 
1475884800, 1476403200, 1475625600, 1476835200, 1477180800, 1477440000, 
1477872000, 1478131200, 1478649600, 1479254400, 1479513600, 1479600000, 
1476316800, 1478476800, 1477526400, 1477699200, 1478304000, 1478649600, 
1476576000, 1477699200, 1480204800, 1480464000, 1480636800, 1476403200, 
1480982400, 1480982400, 1481241600, 1481587200, 1481673600, 1481932800, 
1482278400, 1482537600, 1482624000, 1482796800, 1483056000, 1483315200, 
1483488000, 1483660800, 1483833600, 1484006400, 1484006400, 1484524800, 
1485043200, 1485388800, 1485475200, 1474761600, 1480809600, 1485734400, 
1485734400, 1485993600, 1485993600, 1486425600, 1486425600, 1486598400, 
1486684800, 1487203200, 1487462400, 1487635200, 1487808000, 1487894400, 
1488240000, 1488240000, 1488672000, 1488844800, 1488931200, 1488931200, 
1485648000, 1489449600, 1489449600, 1489536000, 1490140800, 1490313600, 
1490400000, 1490572800, 1490659200, 1490745600, 1491004800, 1489968000, 
1491523200, 1491609600, 1491609600, 1491868800, 1491868800, 1492128000, 
1492560000, 1492732800, 1492992000, 1494720000, 1494892800, 1494979200, 
1495670400, 1495843200, 1495670400, 1495929600, 1495324800, 1496188800, 
1496188800, 1497139200, 1497139200, 1497312000, 1497744000, 1497830400, 
1498953600, 1499299200, 1499990400, 1501459200, 1502496000, 1502668800, 
1502755200, 1503446400, 1503446400, 1504656000, 1503100800, 1505260800, 
1505260800, 1505260800, 1505865600, 1506211200, 1506297600, 1506470400, 
1506470400, 1507075200, 1507248000, 1508025600, 1509235200, 1509321600, 
1509753600, 1510790400, 1510790400, 1510790400, 1511568000, 1511913600, 
1513123200, 1514160000, 1514764800, 1516320000, 1516752000, 1516838400, 
1516838400, 1517184000, 1517270400, 1518480000, 1518998400, 1519171200, 
1519344000, 1519344000, 1519430400, 1519689600, 1519689600), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), duration = structure(c(32, 8, 6, 2, 
1, 4, 12, 2, 2, 1, 1, 6, 4, 6, 2, 2, 7, 7, 7, 1, 1, 1, 8, 5, 
9, 4, 9, 5, 8, 19, 19, 7, 23, 6, 22, 6, 6, 1, 3, 5, 6, 8, 9, 
13, 1, 10, 3, 3, 11, 14, 27, 6, 16, 1, 2, 4, 6, 2, 2, 1, 7, 1, 
19, 3, 1, 1, 7, 1, 14, 1, 1, 6, 6, 7, 1, 1, 1, 2, 34, 4, 2, 7, 
6, 1, 20, 1, 1, 2, 1, 33, 4, 7, 8, 7, 8, 14, 15, 2, 5, 7, 1, 
1, 15, 21, 22, 1, 1, 1, 48, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 
1, 1, 7, 7, 1, 1, 1, 14, 1, 4, 46, 1, 5, 1, 7, 1, 16, 4, 1, 3, 
6, 1, 8, 2, 3, 7, 1, 2, 8, 6, 17, 8, 1, 22, 1, 9, 8, 1, 1, 8, 
1, 14, 4, 29, 9, 11, 13, 5, 2, 16, 7, 36, 7, 15, 6, 1, 6, 3, 
7, 5, 1, 1, 1, 1, 2, 1, 1, 1, 21, 1, 1, 6, 1, 1, 2, 1, 17, 21, 
6, 6, 1, 1, 1, 14, 1, 17, 1, 5, 26, 1, 1, 1, 11, 41, 7, 44, 41, 
55, 50, 12, 5, 1, 59, 4, 2, 1, 1, 16, 1, 1, 1, 37, 1, 36, 3, 
1, 1, 7, 1, 7, 8, 6, 19, 8, 47, 3, 1, 14, 1, 1, 1, 1, 1, 6, 1, 
1, 1, 1, 1, 1, 14, 20, 1, 3, 1, 16, 15, 7, 26, 8, 7, 29, 28, 
39, 29, 6, 62, 2, 1, 79, 79, 1, 35, 7, 2, 14, 18, 1, 16, 92, 
1, 65, 98, 7, 43, 1, 6, 1, 182, 93), class = "difftime", units = "days")), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -300L))

The challenge I am facing now is that for any duration that's more than n days (let's go with n == 7 for now), I want to "split" that duration into n-day long segments. For example, for sample_id == 2, the duration is 32 days, so I want to split that into four 7-day segments, plus one 4-day segment. In the end, the original row for sample_id == 2 would turn into five rows each with start_date, end_date, and duration that correspond to each of the five segments. I would like to have a new column called segment_id to identify each of the newly-created segments while keeping all of the original columns. BTW, for the sample_ids that have original durations shorter than n, I want to keep them as is, and they would get segment_id == 1.

I am stumped. Is there a fairly "tidy" way to achieve this? Thank you very much in advance.


回答1:


We can create columns of lists and unnest them:

library(tidyverse)
library(lubridate)
df %>% group_by(sample_id,site_id) %>% 
  mutate(duration_new = (as.numeric(duration)-1) %>% seq(0,.,by=7) %>% c(duration) %>% diff %>% list,
         start_date_new = list(start_date + days(c(0,cumsum(head(duration_new[[1]],-1))))),
         end_date_new = list(start_date + days(cumsum(duration_new[[1]]))),
         segment_id = list(seq_along(duration_new[[1]]))) %>% 
  unnest %>%
  ungroup
# # A tibble: 619 x 9
#    sample_id site_id start_date          end_date            duration duration_new start_date_new      end_date_new        segment_id
#        <int>   <int> <dttm>              <dttm>              <time>          <dbl> <dttm>              <dttm>                   <int>
#  1         2       2 2015-04-12 00:00:00 2015-05-14 00:00:00 32                  7 2015-04-12 00:00:00 2015-04-19 00:00:00          1
#  2         2       2 2015-04-12 00:00:00 2015-05-14 00:00:00 32                  7 2015-04-19 00:00:00 2015-04-26 00:00:00          2
#  3         2       2 2015-04-12 00:00:00 2015-05-14 00:00:00 32                  7 2015-04-26 00:00:00 2015-05-03 00:00:00          3
#  4         2       2 2015-04-12 00:00:00 2015-05-14 00:00:00 32                  7 2015-05-03 00:00:00 2015-05-10 00:00:00          4
#  5         2       2 2015-04-12 00:00:00 2015-05-14 00:00:00 32                  4 2015-05-10 00:00:00 2015-05-14 00:00:00          5
#  6         8      13 2015-06-10 00:00:00 2015-06-18 00:00:00 8                   7 2015-06-10 00:00:00 2015-06-17 00:00:00          1
#  7         8      13 2015-06-10 00:00:00 2015-06-18 00:00:00 8                   1 2015-06-17 00:00:00 2015-06-18 00:00:00          2
#  8        25      12 2015-07-02 00:00:00 2015-07-08 00:00:00 6                   6 2015-07-02 00:00:00 2015-07-08 00:00:00          1
#  9        41      23 2015-07-22 00:00:00 2015-07-24 00:00:00 2                   2 2015-07-22 00:00:00 2015-07-24 00:00:00          1
# 10        59      47 2015-07-29 00:00:00 2015-07-30 00:00:00 1                   1 2015-07-29 00:00:00 2015-07-30 00:00:00          1
# # ... with 609 more rows

How the unnesting works

After the mutate call, the data.frame has just the same amount of rows as before, but it has new columns. These new columns all contain list elements (elements that are lists), we call them list columns.

Then unnest develop these lists vertically and we get our additional rows, elements from columns that were not list columns are just repeated.

from ?unnest : If you have a list-column, this makes each element of the list its own row.

how duration_new is built

We a simple sequence of breaks using seq (see ?seq), seq doesn't give the last element however (which is the complete duration), so we add it with c. The diff of breaks gives the individual durations.

You can execute this step by step (select until before a pipe, ctrl+enter):

duration <- 32
(as.numeric(duration)-1) %>% seq(0,.,by=7) %>% c(duration) %>% diff %>% list

how start_date_new and end_date_new are built

We start from start_date and add the cumulated durations, for start_date_new we add 0 to the first element, so we keep them offset.




回答2:


I apologize for not using dplyr and tidyr but I am much more fluent in data.table syntax:

n <- 7L
library(data.table)
setDT(my_durations)[, {
  tmp <- unique(c(seq(as.Date(start_date), as.Date(end_date), by = paste(n, "days")), 
                  as.Date(end_date)))
  .(segment_id = head(seq_along(tmp), -1L), 
    segment_start = head(tmp, -1L), 
    segment_end = tail(tmp, -1L),
    segment_duration = diff(tmp))
}, by = .(sample_id, site_id, start_date, end_date, duration)] 
     sample_id site_id start_date   end_date duration segment_id segment_start segment_end segment_duration
  1:         2       2 2015-04-12 2015-05-14  32 days          1    2015-04-12  2015-04-19           7 days
  2:         2       2 2015-04-12 2015-05-14  32 days          2    2015-04-19  2015-04-26           7 days
  3:         2       2 2015-04-12 2015-05-14  32 days          3    2015-04-26  2015-05-03           7 days
  4:         2       2 2015-04-12 2015-05-14  32 days          4    2015-05-03  2015-05-10           7 days
  5:         2       2 2015-04-12 2015-05-14  32 days          5    2015-05-10  2015-05-14           4 days
 ---                                                                                                       
615:      2838     364 2017-11-26 2018-02-27  93 days         10    2018-01-28  2018-02-04           7 days
616:      2838     364 2017-11-26 2018-02-27  93 days         11    2018-02-04  2018-02-11           7 days
617:      2838     364 2017-11-26 2018-02-27  93 days         12    2018-02-11  2018-02-18           7 days
618:      2838     364 2017-11-26 2018-02-27  93 days         13    2018-02-18  2018-02-25           7 days
619:      2838     364 2017-11-26 2018-02-27  93 days         14    2018-02-25  2018-02-27           2 days

The OP has requested that segments whose duration is shorter than n should be kept as is with segment_id == 1. This can be verified for the given dataset with n <- 50L where above code returns

     sample_id site_id start_date   end_date duration segment_id segment_start segment_end segment_duration
  1:         2       2 2015-04-12 2015-05-14  32 days          1    2015-04-12  2015-05-14          32 days
  2:         8      13 2015-06-10 2015-06-18   8 days          1    2015-06-10  2015-06-18           8 days
  3:        25      12 2015-07-02 2015-07-08   6 days          1    2015-07-02  2015-07-08           6 days
  4:        41      23 2015-07-22 2015-07-24   2 days          1    2015-07-22  2015-07-24           2 days
  5:        59      47 2015-07-29 2015-07-30   1 days          1    2015-07-29  2015-07-30           1 days
 ---                                                                                                       
308:      2837     350 2017-08-29 2018-02-27 182 days          2    2017-10-18  2017-12-07          50 days
309:      2837     350 2017-08-29 2018-02-27 182 days          3    2017-12-07  2018-01-26          50 days
310:      2837     350 2017-08-29 2018-02-27 182 days          4    2018-01-26  2018-02-27          32 days
311:      2838     364 2017-11-26 2018-02-27  93 days          1    2017-11-26  2018-01-15          50 days
312:      2838     364 2017-11-26 2018-02-27  93 days          2    2018-01-15  2018-02-27          43 days



回答3:


A potential solution. Notice that I converted the duration column to integer for the operation. You can convert it back if you want.

library(tidyverse)
library(lubridate)
# Define the number of days for one duration
num <- 7L

dat2 <- dat %>%
  mutate(duration = as.integer(duration)) %>%
  # Count number of duration and how many days left
  mutate(times = duration %/% num, left = duration %% num) %>%
  # Repeat each row based on number of duration
  slice(rep(row_number(), times = times + 1)) %>%
  group_by(sample_id, site_id) %>%
  # Process the start and end date
  mutate(seg_start_date = if_else(row_number() < n(),
                              start_date + days(num * (n() - 1)),
                              end_date - days(left)),
         seg_end_date = if_else(row_number() < n(),
                            seg_start_date + days(num - 1),
                            end_date)) %>%
  # Create segment id 
  mutate(segment_id = 1:n()) %>%
  # Create segment duration
  mutate(seg_duration = ifelse(row_number() == n(), left, num)) %>%
  ungroup() %>%
  select(-times, -left)

head(dat2) %>% as.data.frame()
#   sample_id site_id start_date   end_date duration seg_start_date seg_end_date segment_id seg_duration
# 1         2       2 2015-04-12 2015-05-14       32     2015-04-12   2015-04-18          1            7
# 2         2       2 2015-04-12 2015-05-14       32     2015-04-19   2015-04-25          2            7
# 3         2       2 2015-04-12 2015-05-14       32     2015-04-26   2015-05-02          3            7
# 4         2       2 2015-04-12 2015-05-14       32     2015-05-03   2015-05-09          4            7
# 5         2       2 2015-04-12 2015-05-14       32     2015-05-10   2015-05-14          5            4
# 6         8      13 2015-06-10 2015-06-18        8     2015-06-10   2015-06-16          1            7


来源:https://stackoverflow.com/questions/51407177/r-lubridate-split-durations-into-sub-durations

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!