问题
I'm trying to figure out how to order an R data table based on the chaining of 2 columns.
Here's my sample data.table.
dt <- data.table(id = c('A', 'A', 'A', 'A', 'A')
, col1 = c(7521, 0, 7915, 5222, 5703)
, col2 = c(7907, 5703, 8004, 7521, 5222))
id col1 col2
1: A 7521 7907
2: A 0 5703
3: A 7915 8004
4: A 5222 7521
5: A 5703 5222
I need the row order to start with col1 = 0. The col1 value in row 2 should be equal to the value of col2 in the preceding row, and so on.
Additionally, there generally should always be a matching value that chains the row order. But if not, it should select the closest value (see rows 4 & 5 below).
The outcome I'm looking for is shown below:
id col1 col2
1: A 0 5703
2: A 5703 5222
3: A 5222 7521
4: A 7521 7907
5: A 7915 8004
I think I can write a crazy function to do this.. but I'm wondering if there's an elegant data.table solution.
EDIT
I updated the table to include an additional ID with duplicate rows, and a unique source column:
dt <- data.table(id = c('A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B')
, col1 = c(7521, 0, 7915, 5222, 5703, 1644, 1625, 0, 1625, 1625)
, col2 = c(7907, 5703, 8004, 7521, 5222, 1625, 1625, 1644, 1625, 1505)
, source = c('c', 'b', 'a', 'e', 'd', 'y', 'z', 'x', 'w', 'v'))
id col1 col2 source
1: A 7521 7907 c
2: A 0 5703 b
3: A 7915 8004 a
4: A 5222 7521 e
5: A 5703 5222 d
6: B 1644 1625 y
7: B 1625 1625 z
8: B 0 1644 x
9: B 1625 1625 w
10: B 1625 1505 v
There can be matching values within an ID. See B, rows 7 & 9 above. However, there's a unique source for each row where this data comes from.
The desired output would be:
id col1 col2 source
1: A 0 5703 b
2: A 5703 5222 d
3: A 5222 7521 e
4: A 7521 7907 c
5: A 7915 8004 a
6: B 0 1644 x
7: B 1644 1625 y
8: B 1625 1625 w
9: B 1625 1625 z
10: B 1625 1625 v
In the output, the matching rows, 8 & 9 could be in any order.
Thanks!
回答1:
Here's another approach that:
- Reorders the data which will place the 0 value first.
- Loops through the rest of the values to return the index of where
col2
matchescol1
.
setorder(dt, col1)
neworder = seq_len(nrow(dt))
init = 1L
col1 = dt[['col1']]; col2 = dt[['col2']]
for (i in seq_along(neworder)[-1L]) {
ind = match(col2[init], col1)
if (is.na(ind)) break
neworder[i] = init = ind
}
dt[neworder]
## id col1 col2
## <char> <num> <num>
##1: A 0 5703
##2: A 5703 5222
##3: A 5222 7521
##4: A 7521 7907
##5: A 7915 8004
If you are doing it with grouping, you can wrap the loop within a dt[, .I[{...}, by = id]$V1
to return the indices. Or to make it look better, we can make a function.
recursive_order = function (x, y) {
neworder = seq_len(length(x))
init = 1L
for (i in neworder[-1L]) {
ind = match(y[init], x)
if (is.na(ind)) break
# Multiple matches which means all the maining matches are the same number
if (ind == init) {
inds = which(x %in% y[init])
l = length(inds)
neworder[i:(i + l - 2L)] = inds[-1L]
break
}
neworder[i] = init = ind
}
return(neworder)
}
dt <- data.table(id = c('A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B')
, col1 = c(7521, 0, 7915, 5222, 5703, 1644, 1625, 0, 1625, 1625)
, col2 = c(7907, 5703, 8004, 7521, 5222, 1625, 1625, 1644, 1625, 1505)
, source = c('c', 'b', 'a', 'e', 'd', 'y', 'z', 'x', 'w', 'v'))
setorder(dt, col1)
dt[dt[, .I[recursive_order(col1, col2)], by = id]$V1]
id col1 col2 source
<char> <num> <num> <char>
1: A 0 5703 b
2: A 5703 5222 d
3: A 5222 7521 e
4: A 7521 7907 c
5: A 7915 8004 a
6: B 0 1644 x
7: B 1644 1625 y
8: B 1625 1625 z
9: B 1625 1625 w
10: B 1625 1505 v
回答2:
Here is an option using igraph
with data.table
:
#add id in front of cols to distinguishes them as vertices
cols <- paste0("col", 1L:2L)
dt[, (cols) := lapply(.SD, function(x) paste0(id, x)), .SDcols=cols]
#permutations of root nodes and leaf nodes
chains <- dt[, CJ(root=setdiff(col1, col2), leaf=setdiff(col2, col1)), id]
#find all paths from root nodes to leaf nodes
#note that igraph requires vertices to be of character type
library(igraph)
g <- graph_from_data_frame(dt[, .(col1, col2)])
l <- lapply(unlist(
apply(chains, 1L, function(x) all_simple_paths(g, x[["root"]], x[["leaf"]])),
recursive=FALSE), names)
links <- data.table(g=rep(seq_along(l), lengths(l)), col1=unlist(l))
#look up edges
dt[links, on=.(col1), nomatch=0L]
output:
id col1 col2 source g
1: A A0 A5703 b 1
2: A A5703 A5222 d 1
3: A A5222 A7521 e 1
4: A A7521 A7907 c 1
5: A A7915 A8004 a 2
6: B B0 B1644 x 3
7: B B1644 B1625 y 3
8: B B1625 B1625 z 3
9: B B1625 B1625 w 3
10: B B1625 B1505 v 3
data:
library(data.table)
dt <- data.table(id = c('A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B')
, col1 = c(7521, 0, 7915, 5222, 5703, 1644, 1625, 0, 1625, 1625)
, col2 = c(7907, 5703, 8004, 7521, 5222, 1625, 1625, 1644, 1625, 1505)
, source = c('c', 'b', 'a', 'e', 'd', 'y', 'z', 'x', 'w', 'v'))
回答3:
A chain sort solution in base using Reduce
.
fun <- function(j,k) {
i[j] <<- FALSE
r[i][which.min(abs(x$col2[j] - x$col1[i]))]
}
do.call(rbind, lapply(split(dt, dt$id), function(x) {
assign("x", x, envir = .GlobalEnv)
assign("i", rep(TRUE, nrow(x)), envir = .GlobalEnv)
assign("r", seq_along(i), envir = .GlobalEnv)
x[Reduce(fun, r[-1], which.min(x$col1), accumulate = TRUE),]
}))
# id col1 col2 source
#A.2 A 0 5703 b
#A.5 A 5703 5222 d
#A.4 A 5222 7521 e
#A.1 A 7521 7907 c
#A.3 A 7915 8004 a
#B.8 B 0 1644 x
#B.6 B 1644 1625 y
#B.7 B 1625 1625 z
#B.9 B 1625 1625 w
#B.10 B 1625 1505 v
Or using a for
loop:
fun <- function(init, from, to) {
i <- integer(length(to))
i[1] <- init
j <- seq_along(to)[-init]
for(k in seq_along(i)[-1]) {
x <- which.min(abs(to[i[k-1]] - from[j]))
i[k] <- j[x]
j <- j[-x]
}
i
}
do.call(rbind, lapply(split(dt, dt$id), function(x) {
x[fun(which.min(x$col1), x$col1, x$col2),]}))
# id col1 col2 source
#A.2 A 0 5703 b
#A.5 A 5703 5222 d
#A.4 A 5222 7521 e
#A.1 A 7521 7907 c
#A.3 A 7915 8004 a
#B.8 B 0 1644 x
#B.6 B 1644 1625 y
#B.7 B 1625 1625 z
#B.9 B 1625 1625 w
#B.10 B 1625 1505 v
Data:
dt <- data.frame(id = c('A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B')
, col1 = c(7521, 0, 7915, 5222, 5703, 1644, 1625, 0, 1625, 1625)
, col2 = c(7907, 5703, 8004, 7521, 5222, 1625, 1625, 1644, 1625, 1505)
, source = c('c', 'b', 'a', 'e', 'd', 'y', 'z', 'x', 'w', 'v'))
来源:https://stackoverflow.com/questions/61190256/set-r-data-table-row-order-by-chaining-2-columns