问题
I am using the R programming language. I am trying to learn how to plot "non linear decision boundaries", such as the examples seen over here: https://stats.stackexchange.com/questions/212965/how-to-achieve-a-nonlinear-decision-boundary . I am trying to learn how to do this both in 2 Dimensions and in 3 Dimensions. I have figured out how to make the basic plots, but I am still struggling to plot the decision boundaries.
To illustrate this example, I have simulated some data. I then split this data into a training and test set. Then, I fit a statistical model (in this case, "random forest") to the training data, and have this model make predictions on the test data. To plot the results, I use a visualization algorithm called "TSNE" which facilitates visualizing large amounts of data (since I have mixed categorical and continuous data, I take the Gower Distance before doing TSNE). I can make these TSNE visualizations in 2 dimensions and in 3 dimensions. Now, I am trying to figure out how to plot the decision boundaries corresponding to the random forest model.
Below, I show the process:
First, I simulate some data:
#load libraries
library(cluster)
library(Rtsne)
library(dplyr)
library(randomForest)
library(caret)
library(ggplot2)
library(plotly)
library(class)
#PART 1 : Create Data
#generate 4 random variables : response_variable ~ var_1 , var_2, var_3
var_1 <- rnorm(1000,1,4)
var_2<-rnorm(1000,10,5)
var_3 <- sample( LETTERS[1:4], 1000, replace=TRUE, prob=c(0.1, 0.2, 0.65, 0.05) )
response_variable <- sample( LETTERS[1:2], 1000, replace=TRUE, prob=c(0.1, 0.9) )
#put them into a data frame called "f"
f <- data.frame(var_1, var_2, var_3, response_variable)
#declare var_3 and response_variable as factors
f$response_variable = as.factor(f$response_variable)
f$var_3 = as.factor(f$var_3)
#create id
f$ID <- seq_along(f[,1])
Next, I split the data and run a random forest model:
#PART 2: random forest
#split data into train set and test set
index = createDataPartition(f$response_variable, p=0.7, list = FALSE)
train = f[index,]
test = f[-index,]
#create random forest statistical model
rf = randomForest(response_variable ~ var_1 + var_2 + var_3, data=train, ntree=20, mtry=2)
#have the model predict the test set
pred = predict(rf, test, type = "prob")
labels = as.factor(ifelse(pred[,2]>0.5, "A", "B"))
confusionMatrix(labels, test$response_variable)
After, I make a 2D visualization of the results:
#PART 3: Visualize in 2D (source: https://dpmartin42.github.io/posts/r/cluster-mixed-types)
gower_dist <- daisy(test[, -c(4,5)],
metric = "gower")
gower_mat <- as.matrix(gower_dist)
labels = data.frame(labels)
labels$ID = test$ID
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(labels$labels),
name = labels$ID)
plot = ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = labels$labels))
plotly_plot = ggplotly(plot)
Next, I make a 3D visualization:
#PART 4: Visualize in 3D (source: https://rstudio-pubs-static.s3.amazonaws.com/441420_9a7c15988f3c4f59b2d828eb87ba1634.html)
tsne_obj_3 <- Rtsne(gower_dist, dims=3, is_distance = TRUE)
df_m2 <- as.data.frame(tsne_obj_3$Y)
t <- list(
family = "serif",
size = 11,
color = "#852E2E")
3d_plot <- plot_ly(type = 'scatter3d', mode = 'markers', colors = "Accent", color = labels$labels) %>%
add_trace(
x = df_m2$V1,
y = df_m2$V2,
z = df_m2$V3,
marker = list(
size = 3),
name = labels$ID,
text = paste("ID: ", labels$ID),
showlegend = T
) %>%
layout(
title = "3d t-SNA",
titlefont = list(
size = 10
),
paper_bgcolor = "#fffff8",
font = t,
xaxis = list(
zeroline = F
),
yaxis = list(
hoverformat = '.2f',
zeroline = F
)
)
3d_plot
Here is where I am struggling. I want to now show the 2D decision boundary and the 3D decision boundary on top of each respective plot ("plotly_plot" and "3d_plot"). I am not able to figure out how to do this. Here are my attempts:
# Problem 1: interactive 2D visualization with boundary, something like this: https://stackoverflow.com/questions/24052643/how-to-plot-non-linear-decision-boundaries-with-a-grid-in-r
prob <- attr(labels, "prob")
prob <- ifelse(labels$labels=="A", prob, 1-prob)
prob15 <- matrix(prob, 50)
par(mar=rep(3, 4))
contour(unique(xnew[, 1]), unique(xnew[, 2]), prob15, levels=0.5,
labels="", xlab='', ylab='', axes=FALSE, lwd=2.5, asp=1)
title(xlab=expression(italic('X')[1]), ylab=expression(italic('X')[2]),
line=1, family='serif', cex.lab=1.5)
points(X, bg=ifelse(g==1, "#CA002070", "#0571B070"), pch=21)
gd <- expand.grid(x=unique(xnew[, 1]), y=unique(xnew[, 2]))
points(gd, pch=20, cex=0.4, col=ifelse(prob15 > 0.5, "#CA0020", "#0571B0"))
box()
And:
# Problem 2: interactive 3d visualization, something like this: http://www.semspirit.com/artificial-intelligence/machine-learning/regression/support-vector-regression/support-vector-regression-in-r/
axis_1 = df_m2$V1
axis_2 = df_m2$V2
axis_3 = df_m2$V3
plot_ly(x=as.vector(axis_1),y=as.vector(axis_2),z=axis_3, type="scatter3d", mode="markers", name = "Obs", marker = list(size = 3)) %>%
add_trace(x=as.vector(axis_1),y=as.vector(axis_2),z=labels$labels, type = "mesh3d", name = "Preds")
Can someone please show me how to resolve these problems? Thanks
EDIT : the 2D decision boundary can be plotted as follows, but I have not yet figured out how to make it interactive:
library(cluster)
library(Rtsne)
library(dplyr)
library(randomForest)
library(caret)
library(ggplot2)
library(plotly)
#PART 1 : Create Data
#generate 4 random variables : response_variable ~ var_1 , var_2, var_3
var_1 <- rnorm(10000,1,4)
var_2<-rnorm(10000,10,5)
var_3 <- sample( LETTERS[1:4], 10000, replace=TRUE, prob=c(0.1, 0.2, 0.65, 0.05) )
response_variable <- sample( LETTERS[1:2], 10000, replace=TRUE, prob=c(0.4, 0.6) )
#put them into a data frame called "f"
f <- data.frame(var_1, var_2, var_3, response_variable)
#declare var_3 and response_variable as factors
f$response_variable = as.factor(f$response_variable)
f$var_3 = as.factor(f$var_3)
#create id
f$ID <- seq_along(f[,1])
#PART 2: random forest
#split data into train set and test set
index = createDataPartition(f$response_variable, p=0.7, list = FALSE)
train = f[index,]
test = f[-index,]
#create random forest statistical model
rf = randomForest(response_variable ~ var_1 + var_2 + var_3, data=train, ntree=20, mtry=2)
#have the model predict the test set
pred = predict(rf, test, type = "prob")
labels = as.factor(ifelse(pred[,2]>0.5, "A", "B"))
confusionMatrix(labels, test$response_variable)
#PART 3: Visualize in 2D (source: https://dpmartin42.github.io/posts/r/cluster-mixed-types)
gower_dist <- daisy(test[, -c(4,5)],
metric = "gower")
gower_mat <- as.matrix(gower_dist)
labels = data.frame(labels)
labels$ID = test$ID
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(labels$labels),
name = labels$ID)
plot = ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = labels$labels))
plotly_plot = ggplotly(plot)
a = tsne_obj$Y
a = data.frame(a)
data = a
data$class = labels$labels
decisionplot <- function(model, data, class = NULL, predict_type = "class",
resolution = 100, showgrid = TRUE, ...) {
if(!is.null(class)) cl <- data[,class] else cl <- 1
data <- data[,1:2]
k <- length(unique(cl))
plot(data, col = as.integer(cl)+1L, pch = as.integer(cl)+1L, ...)
# make grid
r <- sapply(data, range, na.rm = TRUE)
xs <- seq(r[1,1], r[2,1], length.out = resolution)
ys <- seq(r[1,2], r[2,2], length.out = resolution)
g <- cbind(rep(xs, each=resolution), rep(ys, time = resolution))
colnames(g) <- colnames(r)
g <- as.data.frame(g)
### guess how to get class labels from predict
### (unfortunately not very consistent between models)
p <- predict(model, g, type = predict_type)
if(is.list(p)) p <- p$class
p <- as.factor(p)
if(showgrid) points(g, col = as.integer(p)+1L, pch = ".")
z <- matrix(as.integer(p), nrow = resolution, byrow = TRUE)
contour(xs, ys, z, add = TRUE, drawlabels = FALSE,
lwd = 2, levels = (1:(k-1))+.5)
invisible(z)
}
model <- randomForest(class ~ ., data=data, mtry=2, ntrees=500)
final_plot = decisionplot(model, data, class = "class", main = "rf (1)")
来源:https://stackoverflow.com/questions/65404432/r-superimpose-stack-plots-on-top-of-each-other-ggplot2-plotly