Formula vs non-formula interface in train()

问题

[I looked into similar threads here and in github, and none of the issues suggested by Max and others seem to relate to my case.]

I have seen some people here report the formula interface failing while the non-formula interface works fine for them. My problem is the opposite. The `train()` call below, using the formula interface, works perfectly:

# Formula interface: every column other than `class` is used as a predictor.
glmTune <- train(
  class ~ .,
  data = trainData,
  method = "glmnet",
  tuneGrid = tune.grid,
  trControl = train.control
)

This one below gives NA errors:

# Non-formula (x, y) interface: split the outcome column off by hand.
predictors <- trainData[, names(trainData) != "class"]
response <- trainData[["class"]]
glmTune <- train(
  x = predictors,
  y = response,
  method = "glmnet",
  tuneGrid = tune.grid,
  trControl = train.control
)

This happens with both `glmnet` and `xgboost`, and regardless of whether `y` is a factor or numeric, but `x` has a lot of factor variables. Thanks for any help.

Wanted to add, the error for a factor `y` is this:

Something is wrong; all the Accuracy metric values are missing:
    Accuracy       Kappa    
 Min.   : NA   Min.   : NA  
 1st Qu.: NA   1st Qu.: NA  
 Median : NA   Median : NA  
 Mean   :NaN   Mean   :NaN  
 3rd Qu.: NA   3rd Qu.: NA  
 Max.   : NA   Max.   : NA  
 NA's   :243   NA's   :243  
Error: Stopping
In addition: Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
  There were missing values in resampled performance measures.

And for a numeric `y` it is only slightly different (different performance metric):

Something is wrong; all the RMSE metric values are missing:
      RMSE        Rsquared  
 Min.   : NA   Min.   : NA  
 1st Qu.: NA   1st Qu.: NA  
 Median : NA   Median : NA  
 Mean   :NaN   Mean   :NaN  
 3rd Qu.: NA   3rd Qu.: NA  
 Max.   : NA   Max.   : NA  
 NA's   :100   NA's   :100  
Error: Stopping
In addition: Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
  There were missing values in resampled performance measures.

Here is the code:

library(caret)
library(dplyr)
library(glmnet)

# see dput(droplevels(head(df, 20))) output of data below:

# 70%/30% train/test split (seeded so the partition is reproducible)
set.seed(42)
inTrain <- createDataPartition(df$lnprice, p = 0.7, list = FALSE)
trainData <- df[inTrain, ]
testData <- df[-inTrain, ]

# Resampling scheme: 10-fold cross-validation repeated 5 times, run serially
train.control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  allowParallel = FALSE
)

# Tuning grid: 20 evenly spaced lambda values crossed with three alpha values
tune.grid <- expand.grid(
  lambda = seq(0.0001, 0.1, length.out = 20),
  alpha = c(0, 0.5, 1)
)

# Predictors and outcome for the non-formula (x, y) interface
X <- trainData[, !(names(trainData) %in% "lnprice")]
Y <- trainData$lnprice

# train model
# Swap which of the two marked argument lines is commented out to switch
# between the non-formula and the formula interface.
fit <- train(
#  x = X, y = Y,                        # non-formula
  lnprice ~ ., data = trainData,       # formula
  method = "glmnet",
  preProcess = c("zv", "center", "scale"),
  tuneGrid = tune.grid,
  trControl = train.control)

# plot model
print(plot(fit))

> dput(droplevels(head(df,20)))
structure(list(fuel.type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "gas", class = "factor"), 
    aspiration = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("std", 
    "turbo"), class = "factor"), num.of.doors = structure(c(2L, 
    2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 
    2L, 1L, 2L, 2L), .Label = c("four", "two"), class = "factor"), 
    body.style = structure(c(1L, 1L, 2L, 3L, 3L, 3L, 3L, 4L, 
    3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L), .Label = c("convertible", 
    "hatchback", "sedan", "wagon"), class = "factor"), drive.wheels = structure(c(2L, 
    2L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 1L, 1L), .Label = c("fwd", "rwd", "X4wd"), class = "factor"), 
    engine.location = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "front", class = "factor"), 
    wheel.base = c(88.6, 88.6, 94.5, 99.8, 99.4, 99.8, 105.8, 
    105.8, 105.8, 99.5, 101.2, 101.2, 101.2, 101.2, 103.5, 103.5, 
    103.5, 110, 88.4, 94.5), length = c(168.8, 168.8, 171.2, 
    176.6, 176.6, 177.3, 192.7, 192.7, 192.7, 178.2, 176.8, 176.8, 
    176.8, 176.8, 189, 189, 193.8, 197, 141.1, 155.9), width = c(64.1, 
    64.1, 65.5, 66.2, 66.4, 66.3, 71.4, 71.4, 71.4, 67.9, 64.8, 
    64.8, 64.8, 64.8, 66.9, 66.9, 67.9, 70.9, 60.3, 63.6), height = c(48.8, 
    48.8, 52.4, 54.3, 54.3, 53.1, 55.7, 55.7, 55.9, 52, 54.3, 
    54.3, 54.3, 54.3, 55.7, 55.7, 53.7, 56.3, 53.2, 52), curb.weight = c(2548L, 
    2548L, 2823L, 2337L, 2824L, 2507L, 2844L, 2954L, 3086L, 3053L, 
    2395L, 2395L, 2710L, 2765L, 3055L, 3230L, 3380L, 3505L, 1488L, 
    1874L), engine.type = structure(c(1L, 1L, 4L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L), .Label = c("dohc", 
    "l", "ohc", "ohcv"), class = "factor"), num.of.cylinders = structure(c(2L, 
    2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 2L, 2L), .Label = c("five.six", "four.or.less"), class = "factor"), 
    engine.size = c(130L, 130L, 152L, 109L, 136L, 136L, 136L, 
    136L, 131L, 131L, 108L, 108L, 164L, 164L, 164L, 209L, 209L, 
    209L, 61L, 90L), fuel.system = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    2L), .Label = c("mpfi", "X2bbl"), class = "factor"), bore = c(3.47, 
    3.47, 2.68, 3.19, 3.19, 3.19, 3.19, 3.19, 3.13, 3.13, 3.5, 
    3.5, 3.31, 3.31, 3.31, 3.62, 3.62, 3.62, 2.91, 3.03), stroke = c(2.68, 
    2.68, 3.47, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 2.8, 2.8, 
    3.19, 3.19, 3.19, 3.39, 3.39, 3.39, 3.03, 3.11), compression.ratio = c(9, 
    9, 9, 10, 8, 8.5, 8.5, 8.5, 8.3, 7, 8.8, 8.8, 9, 9, 9, 8, 
    8, 8, 9.5, 9.6), horsepower = c(111, 111, 154, 102, 115, 
    110, 110, 110, 140, 160, 101, 101, 121, 121, 121, 182, 182, 
    182, 48, 70), peak.rpm = c(5000L, 5000L, 5000L, 5500L, 5500L, 
    5500L, 5500L, 5500L, 5500L, 5500L, 5800L, 5800L, 4250L, 4250L, 
    4250L, 5400L, 5400L, 5400L, 5100L, 5400L), city.mpg = c(21L, 
    21L, 19L, 24L, 18L, 19L, 19L, 19L, 17L, 16L, 23L, 23L, 21L, 
    21L, 20L, 16L, 16L, 15L, 47L, 38L), highway.mpg = c(27L, 
    27L, 26L, 30L, 22L, 25L, 25L, 25L, 20L, 22L, 29L, 29L, 28L, 
    28L, 25L, 22L, 22L, 20L, 53L, 43L), make = structure(c(1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 4L, 4L), .Label = c("alfa.romero", "audi", "bmw", 
    "chevrolet"), class = "factor"), lnprice = c(9.5101, 9.7111, 
    9.7111, 9.5432, 9.7671, 9.6323, 9.7819, 9.848, 10.0806, 9.69176, 
    9.7069, 9.7365, 9.9508, 9.9573, 10.1091, 10.334, 10.629, 
    10.5154, 8.5469, 8.7475)), .Names = c("fuel.type", "aspiration", 
"num.of.doors", "body.style", "drive.wheels", "engine.location", 
"wheel.base", "length", "width", "height", "curb.weight", "engine.type", 
"num.of.cylinders", "engine.size", "fuel.system", "bore", "stroke", 
"compression.ratio", "horsepower", "peak.rpm", "city.mpg", "highway.mpg", 
"make", "lnprice"), row.names = c(NA, 20L), class = "data.frame")

回答1:

You don't have any NAs in the example data set, but I still can reproduce the error. I think the problem is that the glmnet method requires numeric variables only and train will create dummy variables for factors with the formula method, but not with the x,y specification (see https://github.com/topepo/caret/issues/1051).

Below I use the recipes package to create one-hot encoded variables out of your factor variables. I remove the variables fuel.type and engine.location because they have only one level in your small example data set.

# The recipe/step_dummy/prep/bake functions below come from recipes.
library(recipes)

# Drop the factors that have only one level in this small example data set.
trainData <- trainData %>% select(-fuel.type, -engine.location)

# One-hot encode the factor predictors only; numeric predictors are left
# untouched (step_dummy cannot encode non-factor columns).
rec <- recipe(lnprice ~ ., data = trainData) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
rec_prep <- prep(rec, trainData)
train_new <- bake(rec_prep, trainData)

# Rebuild the x/y inputs from the encoded data.
X <- as.data.frame(train_new[, !(names(train_new) %in% "lnprice")])
Y <- train_new$lnprice

fit <- train(
  x = X, y = Y,                        # non-formula
  #lnprice ~ ., data = trainData,       # formula
  method = "glmnet",
  preProcess = c("zv", "center", "scale"),
  tuneGrid = tune.grid,
  trControl = train.control)

You will get a warning, but I believe it is due to the very small data set size: Warning message: "missing values in resampled performance measures" in caret train() using rpart

I prep and bake the recipe for demonstration purposes only, but I believe if you wanted to use this with train you would simply pass the recipe to train along with your data. With the recipe specification train will ignore the preprocessing steps you list, so those should also be added as steps to the recipe, see the recipes package. If you have NAs in your larger data set, you can add a step in the recipe to omit the NAs.

回答2:

Strange. It looks like the train.default method doesn't have a default na.action handler?

Output from ?caret::train

## Default S3 method:
train(x, y, method = "rf", preProcess = NULL, ...,
  weights = NULL, metric = ifelse(is.factor(y), "Accuracy", "RMSE"),
  maximize = ifelse(metric %in% c("RMSE", "logLoss", "MAE"), FALSE, TRUE),
  trControl = trainControl(), tuneGrid = NULL,
  tuneLength = ifelse(trControl$method == "none", 1, 3))

Whereas the train.formula method does:

## S3 method for class 'formula'
train(form, data, ..., weights, subset, na.action = na.fail, contrasts = NULL)
                                        ^^^^^^^^^^^^^^^^^^^

If you add na.action = na.fail to your train.default call, x, y interface, do you get the same behaviour as the train.formula call?

来源：https://stackoverflow.com/questions/47705991/formula-vs-non-formula-interface-in-train

标签

r-caret