For those unfamiliar, one-hot encoding simply refers to converting a column of categories (i.e., a factor) into multiple columns of binary indicator variables, where each new column corresponds to one category of the original column.
If no one posts a cleaner approach than writing this out by hand each time, you can always wrap it in a function/macro:
#' One-hot encode categorical columns of a data.table
#'
#' For every column named in `encodeCols`, builds one indicator column per
#' distinct value (named `<column>_<value>`) and joins those indicators back
#' onto `dt` by `grp`.
#'
#' @param dt A data.table containing `grp` and every column in `encodeCols`.
#' @param grp Name (a single string) of the key column used both to reshape
#'   and to join the indicators back. It should identify each row uniquely;
#'   with repeated keys the indicators are per-key counts, not 0/1 flags.
#' @param encodeCols Character vector of column names to encode.
#' @return A new data.table holding the columns of `dt` followed by the
#'   indicator columns. Rows follow the order of the reshaped (dcast) table,
#'   which is not necessarily the input order. The caller's `dt` is not
#'   modified.
OHE <- function(dt, grp, encodeCols) {
  stopifnot(
    data.table::is.data.table(dt),
    is.character(grp), length(grp) == 1L,
    is.character(encodeCols),
    !(grp %in% encodeCols),
    all(c(grp, encodeCols) %in% names(dt))
  )

  for (col in encodeCols) {
    # Reshape a two-column copy so the input table is never touched.
    long <- dt[, c(grp, col), with = FALSE]

    # Pick a helper-column name that cannot collide with `grp` or `col`
    # (a fixed "V1" would clobber a column of that name being encoded).
    flag <- make.unique(c(grp, col, "V1"))[3L]
    data.table::set(long, j = flag, value = 1)

    # Build `grp ~ col` from symbols so non-syntactic names still work.
    wide_formula <- eval(call("~", as.symbol(grp), as.symbol(col)))
    wide <- data.table::dcast(
      long,
      wide_formula,
      fun.aggregate = sum,
      value.var = flag
    )

    # Prefix each level column with the source column name.
    level_cols <- setdiff(names(wide), grp)
    data.table::setnames(wide, level_cols, sprintf("%s_%s", col, level_cols))

    # Same join direction as before: one output row per matching key in `wide`.
    dt <- dt[wide, on = grp]
  }

  dt
}
# Encode the Color and Shape columns of `dt`, keyed by ID, then display it.
dtOHE <- OHE(dt, grp = "ID", encodeCols = c("Color", "Shape"))
dtOHE
ID Color Shape Color_blue Color_green Color_red Shape_cirlce Shape_square Shape_triangle
1: 1 green square 0 1 0 0 1 0
2: 2 red triangle 0 0 1 0 0 1
3: 3 red square 0 0 1 0 1 0
4: 4 blue triangle 1 0 0 0 0 1
5: 5 green cirlce 0 1 0 1 0 0